#install.packages("FNN")
#install.packages("ggrepel")
#install.packages("yardstick")
#install.packages("igraph")
#install.packages("caret")
#update.packages("caret")
#install.packages("reactable")

library(tidyverse)
library(sf)
library(raster)
library(knitr)
library(kableExtra)
library(tidycensus)
library(tigris)
library(FNN)
library(caret)
library(yardstick)
library(pscl)
library(plotROC) 
library(ggrepel)
library(pROC)
library(grid)
library(gridExtra)
library(viridis)
library(igraph)
library(maptools)
library(reactable)
library(classInt)

plotTheme <- theme(
  plot.title =element_text(size=12),
  plot.subtitle = element_text(size=8),
  plot.caption = element_text(size = 6),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title.y = element_text(size = 10),
  # Set the entire chart region to blank
  panel.background=element_blank(),
  plot.background=element_blank(),
  #panel.border=element_rect(colour="#F0F0F0"),
  # Format the grid
  panel.grid.major=element_line(colour="#D0D0D0",size=.75),
  axis.ticks=element_blank())

mapTheme <- theme(plot.title =element_text(size=12),
                  plot.subtitle = element_text(size=8),
                  plot.caption = element_text(size = 6),
                  axis.line=element_blank(),
                  axis.text.x=element_blank(),
                  axis.text.y=element_blank(),
                  axis.ticks=element_blank(),
                  axis.title.x=element_blank(),
                  axis.title.y=element_blank(),
                  panel.background=element_blank(),
                  panel.border=element_blank(),
                  panel.grid.major=element_line(colour = 'transparent'),
                  panel.grid.minor=element_blank(),
                  legend.direction = "vertical", 
                  legend.position = "right",
                  plot.margin = margin(1, 1, 1, 1, 'cm'),
                  legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))

#Colors: Viridis Plasma (option "C")
gray <- "#D3D3D3"
darkGray <- "#505251"
highlight <- "#0D0887FF"
palette2 <- c("#FA9E3BFF", "#0D0887FF")
palette3 <- c("#FDC926FF", "#D8576BFF", "#0D0887FF")
palette4 <- c("#FA9E3BFF", "#D8576BFF", "#9C179EFF", "#0D0887FF")
palette5 <- c("#FDC926FF", "#ED7953FF", "#D8576BFF", "#9C179EFF", "#0D0887FF")
palette10 <- c("#F0F921FF", "#FDC926FF", "#FA9E3BFF", "#ED7953FF", "#D8576BFF",
               "#BD3786FF", "#9C179EFF", "#7301A8FF", "#47039FFF", "#0D0887FF")

#R Markdown Cheat sheet for reference - https://www.rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf
#this function converts a column in to quintiles. It is used for mapping.
quintileBreaks <- function(df,variable) {
    as.character(quantile(df[[variable]],
                          c(.01,.2,.4,.6,.8),na.rm=T))
}

#This function can be used to convert a polygon sf to centroids xy coords.
xyC <- function(aPolygonSF) {
  as.data.frame(
    cbind(x=st_coordinates(st_centroid(aPolygonSF))[,1],
          y=st_coordinates(st_centroid(aPolygonSF))[,2]))
} 

#this function convert a raster to a data frame so it can be plotted in ggplot
rast <- function(inRaster) {
  data.frame(
    xyFromCell(inRaster, 1:ncell(inRaster)), 
    value = getValues(inRaster)) }

#this function aggregates rasters
aggregateRaster <- function(inputRasterList, AtlantaMSA_fishnet) {
  #create an empty fishnet with the same dimensions as the input fishnet
  theseFishnets <- AtlantaMSA_fishnet %>% dplyr::select()
  #for each raster in the raster list
  for (i in inputRasterList) {
  #create a variable name corresponding to the ith raster
  varName <- names(i)
  #convert raster to points as an sf
    thesePoints <-
      rasterToPoints(i) %>%
      as.data.frame() %>%
      st_as_sf(coords = c("x", "y"), crs = st_crs(AtlantaMSA_fishnet)) %>%
      filter(.[[1]] == 1)
  #aggregate to the fishnet
    thisFishnet <-
      aggregate(thesePoints, AtlantaMSA_fishnet, length) %>%
      mutate(!!varName := ifelse(is.na(.[[1]]),0,1))
  #add to the larger fishnet
    theseFishnets <- cbind(theseFishnets, thisFishnet)
  }
  #output all aggregates as one large fishnet
   return(theseFishnets)
}

Introduction: This Model and Its Significance to Planners

Welcome! In this interactive R-markdown memo, we employ historical patterns of development and population trends in the Atlanta, Georgia metropolitan statistical area (“MSA”) to predict future development scenarios. Specifically, we examine the spatial relationships that exist among population growth and highway infrastructure with land development. Through these models, we explore the relationships between each of these independent variables and urban sprawl, our dependent variable.

Urban sprawl is a development scenario characterized by “leap frogging” development, a development scenario that disproportionately consumes land and environmental resources, with significant negative ecological externalities. These models will predict likely urban sprawl for the Atlanta metro area based on two scenarios. The models presented here enable planners and policy makers to preemptively develop policies to calibrate against urban sprawl, empowering municipal governments to create conscientious land allocation plans.

The model developed will predict two different scenarios: demand-side and supply-side. In the demand-side model, we attempt to understand the relationship that population growth has with urban sprawl in the MSA. Specifically, we predict the demand for development according to population growth, and how this would spatially manifest based on existing land use trends and population projections. The supply-side model will predict how future development might occur following the construction of a new, theoretical highway, supplying infrastructure. Through this scenario, we attempt to ascertain the relationship that highway development has with urban sprawl.

Let’s explore how sprawl can occur, unmitigated, based on these independent variables. We use land cover and population data from the years 2008/2009 and 2019 to predict for the year 2029.

  • Region of interest: Atlanta, GA metro

  • Years of study: 2008-2019

  • Projecting to 2029

  • Data level: fishnet grid cell

The following chunks of code will collect the prerequisite data needed for the model from the two years of study, structure the data in preparation for modeling, and run the demand - and supply- side models based on the study years’ data to project for the year 2030.

Data Gathering and Feature Engineering

First, we pull in and visualize our spatial datasets of the Atlanta MSA’s 21 counties.

And retrieve the Atlanta MSA boundary using the tigris package.

#List of the Atlanta region MPOs 21 counties
Counties <- c("Barrow", "Bartow", "Carroll", "Cherokee", 
              "Clayton", "Cobb", "Coweta", "Dawson", 
              "DeKalb", "Douglas", "Fayette", "Forsyth", 
              "Fulton", "Gwinnett", "Hall", "Henry", 
              "Newton", "Paulding", "Rockdale", "Spalding", "Walton")

#First pull all MSAs in the U.S., then search for yours and filter with the correct CBSAFP code
metro_counties <- counties(state = "GA", cb = TRUE, resolution = "500k", year = NULL) %>%
  filter(NAME %in% Counties) %>% 
  st_transform("ESRI:102267")

#Merge counties into one shapefile showing just the boundary
Atlanta_MSA <- st_union(metro_counties) %>% 
  st_transform("ESRI:102267")
#plot MSA boundary to make sure its right
ggplot() + 
  geom_sf(data = metro_counties) +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  mapTheme + 
  labs(title = "Atlanta 21 County MSA")

We then create a fishnet of the Atlanta MSA using 1000’ x 1000’ cells. A fishnet is a vector data set that will allow us to attach other information to the forthcoming raster data set on land cover.

AtlantaMSA_fishnet <- 
    st_make_grid(metro_counties, cellsize = 1000, square = TRUE) %>%
  st_sf() %>% 
  st_transform("ESRI:102267")

AtlantaMSA_fishnet <-
  AtlantaMSA_fishnet[metro_counties,]

#plot MSA boundary to make sure its right
ggplot()+
  geom_sf(data = AtlantaMSA_fishnet,
          fill = "lightgrey") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title = "Atlanta MSA Fishnet") +
  mapTheme

Land Cover

Land Cover by Year and Type

We retrieve the raster files of land cover for the Atlanta MSA. We downloaded these datasets from the Multi-Resolution Land Characteristics Consortium’s National Land Cover Database, and clipped them in ArcGIS to a rectangular boundary extent that aligned with the farthest points of the Atlanta region’s 21 county border.

##SKIP##
#load full size rasters
lc_2008 = raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/56bd35b343d39e88bd8e1fdc89957e6ccb804fe9/Data/raster/2008LC_proj.tif")
    
lc_2019 = raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/56bd35b343d39e88bd8e1fdc89957e6ccb804fe9/Data/raster/2019LC_proj.tif")

The raster files were still very large, so we down-sampled them to make their size more manageable and then clipped them precisely to the MSA boundary.

## SKIP ##

#down sample the raster to make more manageable
lc_2008 <- aggregate(lc_2008, fact = 7, fun = modal)
lc_2019 <- aggregate(lc_2019, fact = 7, fun = modal)

#crop rasters
lc_2008_crop <- crop(lc_2008, extent(metro_counties))
lc_2019_crop <- crop(lc_2019, extent(metro_counties)) 

#mask rasters
lc_2008_crop <- mask(x = lc_2008_crop, mask = metro_counties)
lc_2019_crop <- mask(x = lc_2019_crop, mask = metro_counties)

#Export downsized rasters
#writeRaster(lc_2008_crop, filename=file.path(rast_dir, "lc2008_sml.tif"), format="GTiff", overwrite=TRUE)

#writeRaster(lc_2019_crop, filename=file.path("C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/raster", "lc2019_sml.tif"), format="GTiff", overwrite=TRUE)
lc_2008 <- raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/24f08e61c5ec983b3fa39f1f3b03fb90aa3a13ee/Data/raster/lc2008_sml.tif") 

lc_2019 <- raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/24f08e61c5ec983b3fa39f1f3b03fb90aa3a13ee/Data/raster/lc2019_sml.tif") 

This allows us to visualize Atlanta’s land cover in 2008 and 2019.

grid.arrange(ncol = 1,
ggplot() +
  geom_raster(data=rast(lc_2008) %>% na.omit %>% filter(value > 0), 
              aes(x,y,fill=as.factor(value))) +
  scale_fill_viridis(discrete=TRUE,
                     name ="",
                     option = "C") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "Land Cover, 2008") +
  mapTheme +
  theme(legend.direction="horizontal"),


ggplot() +
  geom_raster(data=rast(lc_2019) %>% na.omit %>% filter(value > 0), 
              aes(x, y, fill=as.factor(value))) +
  scale_fill_viridis(discrete=TRUE,
                     name ="",
                     option = "C") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
      geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "Land Cover, 2019") +
  mapTheme +
  theme(legend.direction="horizontal")
)

Viewing land cover at the MSA level in 2008 and 2019 shows that change is happening at too small a scale to discern visually at this extent. Therefore, we summon the powers of R. First, we distill the many specific land cover classifications into six simple buckets: developed land, forest, farmland, wetlands, water bodies, and other undeveloped lands.

Land Cover by Type

developed08 <- lc_2008 == 21 | lc_2008 == 22 | lc_2008 == 23 | lc_2008 == 24
forest08 <- lc_2008 == 41 | lc_2008 == 42 | lc_2008 == 43 
farm08 <- lc_2008 == 81 | lc_2008 == 82 
wetlands08 <- lc_2008 == 90 | lc_2008 == 95 
otherUndeveloped08 <- lc_2008 == 52 | lc_2008 == 71 | lc_2008 == 31 
water08 <- lc_2008 == 11

developed19 <- lc_2019 == 21 | lc_2019 == 22 | lc_2019 == 23 | lc_2019 == 24
forest19 <- lc_2019 == 41 | lc_2019 == 42 | lc_2019 == 43 
farm19 <- lc_2019 == 81 | lc_2019 == 82 
wetlands19 <- lc_2019 == 90 | lc_2019 == 95 
otherUndeveloped19 <- lc_2019 == 52 | lc_2019 == 71 | lc_2019 == 31 
water19 <- lc_2019 == 11

names(developed08) <- "developed08"
names(forest08) <- "forest08"
names(farm08) <- "farm08"
names(wetlands08) <- "wetlands08"
names(otherUndeveloped08) <- "otherUndeveloped08"
names(water08) <- "water08"

names(developed19) <- "developed19"
names(forest19) <- "forest19"
names(farm19) <- "farm19"
names(wetlands19) <- "wetlands19"
names(otherUndeveloped19) <- "otherUndeveloped19"
names(water19) <- "water19"

This distilled, categorical dataset for each year is then aggregated to our fishnet of the Atlanta MSA. The resulting maps show that Atlanta is a heavily forested area. Development is vast, with its anchor in the center of the MSA, radiating outward. A significant amount of farmland exists in a thick band against the MSA’s periphery. Undeveloped areas are scattered, also mostly near the MSA’s periphery.

theRasterList08 <- c(developed08, forest08, farm08, wetlands08, otherUndeveloped08, water08)

theRasterList19 <- c(developed19, forest19, farm19, wetlands19, otherUndeveloped19, water19)

aggregatedRasters08 <-
  aggregateRaster(theRasterList08, AtlantaMSA_fishnet) %>%
  dplyr::select(developed08, forest08, farm08, wetlands08, otherUndeveloped08, water08) %>%
  mutate_if(is.numeric, as.factor)

aggregatedRasters19 <-
  aggregateRaster(theRasterList19, AtlantaMSA_fishnet) %>%
  dplyr::select(developed19, forest19, farm19, wetlands19, otherUndeveloped19, water19) %>%
  mutate_if(is.numeric, as.factor)

rasters08_map <- aggregatedRasters08 %>%
  gather(var,value,developed08:water08) %>%
  st_cast("POLYGON") %>%    #just to make sure no weird geometries slipped in
  mutate(X = xyC(.)$x,
         Y = xyC(.)$y) %>%
  ggplot() +
    geom_sf(data=metro_counties) +
    geom_point(aes(X,Y, colour=as.factor(value))) +
    facet_wrap(~var) +
    scale_colour_manual(values = c(gray, highlight),
                        labels=c("False","True"),
                        name = "") +
    geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .25) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .5) +
    labs(title = "Land Cover Types, 2008",
         subtitle = "As fishnet centroids") +
   mapTheme +
   theme()  +
   theme(legend.position = "none")

rasters08_map

Land Cover Change

To identify areas with land cover change, we use the raster data sets and reclassify the land cover types as numerical values, where developed lands have the value of 1 and undeveloped lands have the value of 0, for each of the years’ data sets. We are then able to add these data sets together on a cell-by-cell basis to understand where development has occurred in 2019 and where it was not in 2008. When added together, the cells with values of 1 represent where new development has occurred; a value of 2 shows where development was in 2008 and remained in 2019, and a value of 0 shows land that was and remains undeveloped. This function is like using raster calculator in GIS.

#creat matrix for reclassifying land cover
reclassMatrix <- 
  matrix(c(
    0,12,0,
    12,24,1,
    24,Inf,0),
  ncol=3, byrow=T)

reclassMatrix
#reclassify land cover change using reclass matrix

lc_2008r <- 
  reclassify(lc_2008, reclassMatrix)
lc_2019r <- 
  reclassify(lc_2019, reclassMatrix)

lc_2008r[lc_2008r < 1] <- 0
names(lc_2008r) <- "dev08"

lc_2019r[lc_2019r < 1] <- 0
names(lc_2019r) <- "dev19"

development_change <- lc_2008r + lc_2019r

# convert the raster to a vector
dev_change_vec <- as.vector(development_change)

# create the histogram plot with ggplot2
ggplot(data.frame(value = dev_change_vec), aes(x=value)) + 
  geom_histogram(binwidth=.5, fill=highlight, color="white") +
  labs(title="Development Change by Type (2008-2019)",
       subtitle = "0 = Remained Undeveloped, 1 = New Development, 2 = Already Developed",
       x="Development Change",
       y="Frequency") +
  plotTheme

This shows us that most cells in the MSA remained undeveloped from 2008 to 2019, many of them were already developed, and a few of the previously undeveloped cells converted to developed. We can then visualize where the new development cells with the 1 value are, which shows that new development has been scattered throughout the MSA.

development_change[development_change != 1] <- NA

ggplot() +
  geom_sf(data=Atlanta_MSA) +
  geom_raster(data=rast(development_change) %>% na.omit, 
              aes(x,y,fill=as.factor(value))) +
  scale_fill_viridis(discrete=TRUE,
                     option = "C",
                     name ="Cells \nwith Land Cover\nChange") + 
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title="Development Land Cover Change (2008-2019)",
       subtitle = "Atlanta metro area | Raster cells") +
  mapTheme +
   theme(legend.position = "none")

We then join this dataset, mapping new development, to our fishnet. This has the effect of magnifying where the development change has happened a bit.

dev_change <- reclassify(development_change, reclassMatrix)
                         
dev_change[development_change < 1] <- NA

names(dev_change) <- "development_change"


changePoints <-
  rasterToPoints(dev_change) %>%
  as.data.frame() %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(AtlantaMSA_fishnet))

dev_change_fishnet <- 
  aggregate(changePoints, AtlantaMSA_fishnet, sum) %>%
  mutate(development_change = ifelse(is.na(development_change),0,1),
         development_change = as.factor(development_change))

ggplot() +
  geom_point(data=dev_change_fishnet, 
             aes(x=xyC(dev_change_fishnet)$x, y=xyC(dev_change_fishnet)$y, colour=development_change)) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("No Change","New Development"),
                      name = "") +
    geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "New Development (2019)", subtitle = "Atlanta metro area | Fishnet centroids") +
  mapTheme

#dev_change_fishnet <- st_write(dev_change_fishnet, "C:/Users/Lindsey/Desktop/CPLN675_Land_Use_Modeling_Desktop/Final/LUEM_A5_Repo/Data/geojson/dev_change_fishnet.geojson")

Census Data: Population

Next, we set up our MSA population data, which will prepare us for the demand-side model. The next two chunks download the population data for 2009 and 2019 from the American Community Survey 5-year Estimates on a census tract level.

We then visualize the population for the two years by the respective census tract boundaries for each of the years.

grid.arrange(
ggplot() +
  geom_sf(data = Pop09, aes(fill=factor(ntile(pop09,5))), colour=NA) +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  scale_fill_manual(values = palette5,
                    labels=quintileBreaks(Pop09,"pop09"),
                   name="Quintile\nBreaks") +
  labs(title="2009 Population\nAtlanta 21-County MSA",
       subtitle="By Census Tract") +
  mapTheme,

ggplot() +
  geom_sf(data = Pop19, aes(fill=factor(ntile(pop19,5))), colour=NA) +
  scale_fill_manual(values = palette5,
                    labels=quintileBreaks(Pop19,"pop19"),
                    name="Quintile\nBreaks") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title="2019 Population\nAtlanta 21-County MSA",
       subtitle="By Census Tract") +
  mapTheme, ncol=2)

We then reconcile the tract boundaries with the fishnet grid cells to be able to conduct analysis and build models with our fishnet data set.

### Breaking here on select(fishnetID)
AtlantaMSA_fishnet <-
  AtlantaMSA_fishnet %>%
  rownames_to_column("fishnetID") %>% 
  mutate(fishnetID = as.numeric(fishnetID)) %>%
  dplyr::select(fishnetID)

fishnetPop09 <-
  st_interpolate_aw(Pop09["pop09"], AtlantaMSA_fishnet, extensive=TRUE) %>%
  as.data.frame(.) %>%
  rownames_to_column(var = "fishnetID") %>%
  left_join(AtlantaMSA_fishnet %>%
              mutate(fishnetID = as.character(fishnetID)),
            ., by=c("fishnetID"='fishnetID')) %>% 
  mutate(pop09 = replace_na(pop09,0)) %>%
  dplyr::select(pop09)

fishnetPop19 <-
  st_interpolate_aw(Pop19["pop19"], AtlantaMSA_fishnet, extensive=TRUE) %>%
  as.data.frame(.) %>%
  rownames_to_column(var = "fishnetID") %>%
  left_join(AtlantaMSA_fishnet %>%
              mutate(fishnetID = as.character(fishnetID)),
            ., by=c("fishnetID"='fishnetID')) %>% 
  mutate(pop19 = replace_na(pop19,0)) %>%
  dplyr::select(pop19)

fishnetPop <- 
  cbind(fishnetPop09, fishnetPop19) %>%
  dplyr::select(pop09,pop19) %>%
  mutate(pop_Change = pop19 - pop09)

We can then compare population by tract vs population by grid cell.

grid.arrange(
ggplot() +
  geom_sf(data=Pop19, aes(fill=factor(ntile(pop19,5))),colour=NA) +
  scale_fill_manual(values = palette5,
                    labels=substr(quintileBreaks(Pop19,"pop19"),1,4),
                   name="Quintile\nBreaks") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title="2019 Population\nAtlanta 21-County MSA",
       subtitle="Represented as tracts; Boundaries omitted") +
  mapTheme,

ggplot() +
  geom_sf(data=fishnetPop, aes(fill=factor(ntile(pop19,5))), colour=NA) +
  scale_fill_manual(values = palette5,
                   labels=substr(quintileBreaks(fishnetPop,"pop19"),1,4),
                   name="Quintile\nBreaks") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title="2019 Population\nAtlanta 21-County MSA",
       subtitle="Represented as fishnet gridcells; Boundaries omitted") +
  mapTheme, ncol=2)

Highway Distance

Next, we prepare our supply-side model by loading in spatial data for Atlanta’s existing highways.

## Reading layer `Major_Roads' from data source 
##   `https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/8b689651286bb4ff28554095bd39ce1bd87655e4/Data/geojson/Major_Roads.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 2669 features and 6 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -85.35708 ymin: 32.84459 xmax: -83.50588 ymax: 34.57842
## Geodetic CRS:  WGS 84

This suggests there may be an association between proximity to highways and new development. To better understand this relationship, we calculate how far each fishnet cell is from a highway. We do this by converting the highway layer to raster, converting the raster to points, and then calculating the mean distance from a highway for each grid cell.

#SKIP#
#create a raster layer of highways
highway_raster <- 
  as(AtlantaHighways,'Spatial') %>%
  rasterize(.,emptyRaster)

#Export rasters

rast_dir = paste(dir_ct, 'raster', sep='/')

#writeRaster(highway_raster, filename=file.path(rast_dir, "highway_raster.tif"), format="GTiff", overwrite=TRUE)

This visualizes each cell’s distance from a highway.

Calculating the Spatial Lag of Development

We measure accessibility by way of a spatial lag, hypothesizing that new development is a function of distance to existing development. The shorter the distance, the more accessible a grid cell is to existing development. We measure this by calculating the average distance from each grid cell to its 2 nearest developed neighboring grid cells in 2009.

nn_function <- function(measureFrom,measureTo,k) {
  #convert the sf layers to matrices
  measureFrom_Matrix <-
    as.matrix(measureFrom)
  measureTo_Matrix <-
    as.matrix(measureTo)
  nn <-   
    get.knnx(measureTo, measureFrom, k)$nn.dist
    output <-
    as.data.frame(nn) %>%
    rownames_to_column(var = "thisPoint") %>%
    gather(points, point_distance, V1:ncol(.)) %>%
    arrange(as.numeric(thisPoint)) %>%
    group_by(thisPoint) %>%
    summarize(pointDistance = mean(point_distance)) %>%
    arrange(as.numeric(thisPoint)) %>% 
    dplyr::select(-thisPoint) %>%
    pull()
  
  return(output)  
}
AtlantaMSA_fishnet$lagDevelopment08 <-
    nn_function(xyC(AtlantaMSA_fishnet),
                xyC(filter(aggregatedRasters08, developed08==1)),
                2)

# compute natural breaks for "lagDevelopment08"
breaks <- classIntervals(AtlantaMSA_fishnet$lagDevelopment08, n=4, style="jenks")

#Plot
ggplot() +
  geom_sf(data=metro_counties) +
  geom_point(data=AtlantaMSA_fishnet, 
             aes(x=xyC(AtlantaMSA_fishnet)[,1], y=xyC(AtlantaMSA_fishnet)[,2], 
                 colour=cut(lagDevelopment08, breaks$brks)), size=1.5) +
  scale_colour_manual(values=palette5,
                      labels=format(breaks$brks, nsmall=1),
                      name="Distance (ft)") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "Spatial Lag to 2008 Development",
       subtitle = "As fishnet centroids") +
  mapTheme

This shows most of the area in the MSA’s inner counties was near existing development in 2008. It was only the counties at the outer fringes that still had a few areas relatively distant from development.

Create Final Dataset

We then aggregate all these data to a final fishnet data set for feeding into our model. This fishnet includes columns which indicate, cell-by-cell, if there has been development change, land use type, the cell’s distance to highways, the cell’s distance to existing development, and its population in both years.

dat <-  
  cbind(
    AtlantaMSA_fishnet, dev_change_fishnet, highwayPoints_fishnet, fishnetPop, aggregatedRasters08) %>%
  dplyr::select(development_change, developed08, forest08, farm08, wetlands08, otherUndeveloped08, water08,
                pop09, pop19, pop_Change, distance_highways, lagDevelopment08) %>%
  st_join(metro_counties) %>%
  mutate(developed19 = ifelse(development_change  == 1 & developed08 == 1, 0, 1)) %>% #charlie to confirm if the else value should be 1 or 'developed'
  filter(water08 == 0) 

Data Exploration

Now that all of our data are assembled, we explore how each factor uncovered thus far is related to new development. This plot displays distance to highways and the spatial lag of development as continuous variables. This shows us that new development has been, on average, closer to highways, and closer to existing development in aggregate, than cells that had no change in land cover. This supports our previous hypothesis that new development might be occurring closer to highways and to previously developed areas.

dat %>%
  dplyr::select(distance_highways,lagDevelopment08,development_change) %>% 
  gather(Variable, Value, -development_change, -geometry) %>%
  ggplot(., aes(development_change, Value, fill=development_change)) + 
    geom_bar(position = "dodge", stat = "summary", fun.y = "mean") +
    facet_wrap(~Variable) +
    scale_fill_manual(values = palette2,
                      labels=c("No Change","New Development"),
                      name="") +
    labs(title="New Development as a Function of the Continuous Variables") +
    plotTheme

This next plot shows that new development has, on average, occurred in cells where the population is greater or growing. From this, we learn that development in Atlanta MSA is also positively correlated with the amount of population and population growth in an area.

dat %>%
  dplyr::select(pop09,pop19,pop_Change,development_change) %>%
  gather(Variable, Value, -development_change, -geometry) %>%
  ggplot(., aes(development_change, Value, fill=development_change)) + 
    geom_bar(position = "dodge", stat = "summary", fun.y = "mean") +
    facet_wrap(~Variable) +
    scale_fill_manual(values = palette2,
                      labels=c("No Change","New Development"),
                      name="") +
    labs(title="New Development as a Function of Factor Variables") +
    plotTheme

From the above data exploration, we can see that development is positively correlated with proximity to highways and development, as well as with existing and growing population. However, this leaves us wondering which types of land cover are most likely to convert to developed land?

The conversion table shows us that forested areas in 2008 were the most likely to have been developed by 2019. This shows that Atlanta has a preference for developing woodlands first, with areas adjacent to those that are already developed as a close second.

dat %>%
  dplyr::select(development_change:otherUndeveloped08,developed08) %>%
  gather(Land_Cover_Type, Value, -development_change, -geometry) %>%
   st_set_geometry(NULL) %>%
     group_by(development_change, Land_Cover_Type) %>%
     summarize(n = sum(as.numeric(Value))) %>%
     ungroup() %>%
    mutate(Conversion_Rate = paste0(round(100 * n/sum(n), 2), "%")) %>%
    filter(development_change == 1) %>%
  dplyr::select(Land_Cover_Type,Conversion_Rate) %>%
  reactable(theme = reactableTheme(style = list(fontFamily = "Arial, sans-serif", fontSize = "1.0rem")))


Binary Logistic Regression - Predicting for 2019

Now that we have gathered a number of factors we hypothesized to be related to new development, and have proved that they have at least some association with development, we can build and train a model to predict new development based on them.

We chose to use binomial logistic regression for this assignment because it is a type of statistical model that is well suited to analyzing relationships between a binary response variable and one or more predictor variables. It estimates the probability of the response variable taking one of two possible values (in this case developed or not developed) based on the values of the predictor variables.

To predict the likelihood of future development, we use data from 2008/2009 to create a series of logistic regression models to predict land use change in 2019. We then evaluate the performance of these models to select our best model for predicting development in 2029.

Model Building

The first step of this process is randomly splitting the data in half to create a testing data set with one half and a training data set with the other.

set.seed(3456)
trainIndex <- 
  createDataPartition(dat$developed08, p = .50,
                                  list = FALSE,
                                  times = 1)
datTrain <- dat[ trainIndex,]
datTest  <- dat[-trainIndex,]

We then build a series of binary logistic regression models based on our independent variables.

The six models we created are numbered to reflect increasing complexity. While Model 1 concentrates exclusively on land cover type and whether development change occurred between the two time periods, later models incorporate the spatial lag of development, population elements, and our distance variables.

Model1 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08, 
              family="binomial"(link="logit"), data = datTrain)

Model2 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08, 
              family="binomial"(link="logit"), data = datTrain)
              
Model3 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop09, 
              family="binomial"(link="logit"), data = datTrain)          
              
Model4 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop09 + pop19, 
              family="binomial"(link="logit"), data = datTrain)              
            
Model5 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop09 + pop19 + pop_Change,
              family="binomial"(link="logit"), data = datTrain)              
              
Model6 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop_Change + pop09 + pop19 + distance_highways, 
              family="binomial"(link="logit"), data = datTrain) 


modelList <- paste0("Model", 1:6)

map_dfc(modelList, function(x)pR2(get(x)))[4,] %>%
  setNames(paste0("Model",1:6)) %>%
  gather(Model,McFadden) %>%
  ggplot(aes(Model,McFadden)) +
    geom_bar(stat="identity", fill = highlight) +
    labs(title= "McFadden R-Squared by Model") +
    plotTheme
## fitting null model for pseudo-r2
## fitting null model for pseudo-r2
## fitting null model for pseudo-r2
## fitting null model for pseudo-r2
## fitting null model for pseudo-r2
## fitting null model for pseudo-r2

We then examine the McFadden R-Squared of the six models to select our best final model. Model 1 clearly performs more poorly than the others. However, models 2-4 also perform moderately less well than the remaining models. Therefore, we eliminate models 1-4 and choose Model 6 as our final model because it performs the best and allows us to explore the effects of all of our variables of interest.

We then use the predicted values from Model 6 to create a histogram of the model’s test set predicted probabilities. This shows us that a high density of cells have roughly a 40% likelihood of development. This makes sense given the sprawling nature of the Atlanta MSA’s existing development patterns, which are acreage intensive. It also makes sense given the map of new development we plotted earlier, in which it seems feasible that roughly /~30-40% of the MSA was covered by new development in 2019.

testSetProbs <- 
  data.frame(class = datTest$development_change,
             probs = predict(Model6, datTest, type="response")) 

chart_testProbs <-  
ggplot(testSetProbs, aes(probs)) +
  geom_density(aes(fill=class), alpha=0.75) +
  scale_fill_manual(values = palette2,
                    labels=c("No Change","New Development")) +
  labs(title = "Histogram of test set predicted probabilities",
       x="Predicted Probabilities",y="Density") +
  plotTheme

chart_testProbs

Accuracy

This next segment tests our model’s:

  1. Sensitivity: the extent to which it accurately predicts new development, or “true positives”, and:
  2. Specificity, the extent to which it accurately predicts “true negatives.”

Our ultimate goal with a land cover change model is to optimize the balance between the model’s predictions with regard to both sensitivity and specificity. Meaning, we want the model to be capable of predicting where development occurs so that we can plan for those areas, but we also want it to accurately predict where development would not occur, to inform our other land use planning.

At the first two thresholds specified below, 5% and 17%, we can see that the 5% threshold very accurately predicts the “true negatives,” where development does not occur. The threshold of 17%, however, performs better on the “true positive” rate of predicted v. actual development, that has occurred. We found, however, that a threshold of around 34% gives the best balance between the trade-offs of sensitivity and specificity. We chose 34% because it gives us the highest overall accuracy, and allows us to keep both our sensitivity and specificity over 50%.

options(yardstick.event_first = FALSE)

testSetProbs <- 
  testSetProbs %>% 
  mutate(predClass_05 = as.factor(ifelse(testSetProbs$probs >= 0.05 ,1,0)),
         predClass_17 = as.factor(ifelse(testSetProbs$probs >= 0.17 ,1,0)),
         predClass_34 = as.factor(ifelse(testSetProbs$probs >= 0.34 ,1,0))) 

testSetProbs %>%
  dplyr::select(-probs) %>%
  gather(Variable, Value, -class) %>%
  group_by(Variable) %>%
  summarize(Sensitivity = round(yardstick::sens_vec(class,factor(Value)),2),
            Specificity = round(yardstick::spec_vec(class,factor(Value)),2),
            Accuracy = round(yardstick::accuracy_vec(class,factor(Value)),2)) %>% 
  reactable(theme = reactableTheme(style = list(fontFamily = "Arial, sans-serif", fontSize = "1.0rem")))

We then convert these true/false positive/negative indicators to factors in order to map them.

predsForMap <-         
  dat %>%
    mutate(probs = predict(Model6, dat, type="response") ,
           Threshold_5_Pct = as.factor(ifelse(probs >= 0.05 ,1,0)),
           Threshold_17_Pct =  as.factor(ifelse(probs >= 0.17 ,1,0)),
           Threshold_34_Pct =  as.factor(ifelse(probs >= 0.34 ,1,0))) %>%
    dplyr::select(development_change,Threshold_5_Pct,Threshold_17_Pct, Threshold_34_Pct) %>%
    gather(Variable,Value, -geometry) %>%
    st_cast("POLYGON")

ggplot() +
  geom_point(data=predsForMap, aes(x=xyC(predsForMap)[,1], y=xyC(predsForMap)[,2], colour=Value)) +
  facet_wrap(~Variable) +
  scale_colour_manual(values = c(gray, highlight), labels=c("No Change","New Development"),
                      name="") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title="Development Predictions - Low Threshold") + 
  mapTheme +
  theme(legend.position = "bottom")

These maps show that the 34% nicely captures the MSA’s overall pattern of development while minimizing the tendency to over-predict development that the 17% and 5% thresholds share (the 5% threshold wildly so).

To better understand the trade-offs between accurately predicting false positives and false negatives, we can run our predictions through a confusion matrix to visualize the true positives (“TrueP”) and true negatives (“TrueN”) in the charts below, for each of the thresholds.

dat <- dat %>% 
  mutate(probs = predict(Model6, dat, type="response"))
  
ConfusionMatrix.metrics <-
  dat %>%
    mutate(Threshold_5_Pct = as.factor(ifelse(probs >= 0.05 ,1,0)),
           Threshold_17_Pct =  as.factor(ifelse(probs >= 0.17 ,1,0)),
          Threshold_34_Pct =  as.factor(ifelse(probs >= 0.34 ,1,0))) %>%
    mutate(TrueP_05 = ifelse(development_change  == 1 & Threshold_5_Pct == 1, 1,0),
           TrueN_05 = ifelse(development_change  == 0 & Threshold_5_Pct == 0, 1,0),
           TrueP_17 = ifelse(development_change  == 1 & Threshold_17_Pct == 1, 1,0),
           TrueN_17 = ifelse(development_change  == 0 & Threshold_17_Pct == 0, 1,0),
           TrueP_34 = ifelse(development_change  == 1 & Threshold_34_Pct == 1, 1,0),
           TrueN_34 = ifelse(development_change  == 0 & Threshold_34_Pct == 0, 1,0)) %>%
    dplyr::select(., starts_with("True")) %>%
    gather(Variable, Value, -geometry) %>%
    st_cast("POLYGON") 

ggplot(data=ConfusionMatrix.metrics) +
  geom_point(aes(x=xyC(ConfusionMatrix.metrics)[,1], 
                 y=xyC(ConfusionMatrix.metrics)[,2], colour = as.factor(Value))) +
  facet_wrap(~Variable) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("Incorrect","Correct"),
                      name="") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title="Development Predictions - Low Threshold") +
  mapTheme +
  theme(legend.position = "bottom")

Mapping the sensitivity (true positives) and specificity (true negatives) of these thresholds shows in further detail that the 34% threshold predicts noticeably true negatives than the other thresholds while predicting only slightly less true positives. Therefore, we select 34% as the probability threshold at which to classify land as likely to be developed in the remainder of this project.

Planning Scenarios

Now that we have developed a predictive model for new development, we can use it to test the effects of potential future scenarios. We first use the model to predict how a demand-side change (projected population) might affect development in 2029. We then use the model to predict how a supply-side change (a new highway) might affect development

Scenario 1: Demand-side Change Forecast

In this demand-side projection, we will use population projections for the Atlanta MSA counties and distribute the population among our fishnet cells. The goal is to visualize change and plan for growth scenarios based on how anticipated future population might drive development.

First, we replace the lag development 2008 column with the distance from development in 2019.

dat <-
  dat %>%
  mutate(lagDevelopment08 = nn_function(xyC(.), xyC(filter(., developed19 == 1)),2))

Next, we look at projected population by county in 2029 based on population estimates from the Georgia Data Analytics Center.

#County projections 2029
#Barrow 96977 -
#Bartow 119594 - 
#Carroll 131577 -
#Cherokee  301752-
#Clayton  321410-
#Cobb  830671-
#Coweta  172341 -
#Dawson     32572-
#DeKalb  822154-
#Douglas  160635-
#Fayette  128627-
#Forsyth   316165-
#Fulton  1198334-
#Gwinnett   1044026-
#Hall       232720-
#Henry   278739-
#Newton 130559-
#Paulding 208196-
#Rockdale   95574-
#Spalding   70974-
#Walton     109205-

countyPopulation_2029 <- 
  data.frame(
   NAME = Counties,
   county_projection_2029 = 
     c(96977,119594,131577,301752,321410,830671,172341,32572,822154,160635,128627,
       316165,1198334,1044026,232720,278739,130559,208196,95574,70974,109205)) %>%
   left_join(
     dat %>%
       st_set_geometry(NULL) %>%
       group_by(NAME) %>%
       summarize(county_population_2019 = round(sum(pop19))))

countyPopulation_2029 %>%
  gather(Variable,Value, -NAME) %>%
  ggplot(aes(reorder(NAME,-Value),Value)) +
  geom_bar(aes(fill=Variable), stat = "identity", position = "dodge") +
  scale_fill_manual(values = palette2,
                    labels=c("2019","2029"),
                    name="Population") +
  labs(title="Population Change by County: 2019 - 2029",
       x="County", y="Population") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  plotTheme

We see that Fulton, Gwinnett, DeKalb, and Cobb counties are projected to experience the largest population increases between 2019 and 2029.

Predicting Development Demand (based on projected population)

Now, we will take the county-level population projections and distribute them across the study area. We do this by weighting it proportionally according to a grid cell’s 2019 population and then subtracting 2019 pop to get pop_change.

dat_infill <-
  dat %>%
  #calculate population change
    left_join(countyPopulation_2029) %>%
    mutate(proportion_of_county_pop = pop19 / county_population_2019,
           pop_2029.infill = proportion_of_county_pop * county_projection_2029,
           pop_Change = round(pop_2029.infill - pop19),2) %>%
    dplyr::select(-county_projection_2029, -county_population_2019, 
                  -proportion_of_county_pop, -pop_2029.infill) %>%
  #predict for 2029
    mutate(predict_2029.infill = predict(Model6,. , type="response"))

map_pred29 <- 
dat_infill %>%
  ggplot() +  
  geom_point(aes(x=xyC(dat_infill)[,1], y=xyC(dat_infill)[,2], colour = factor(ntile(predict_2029.infill,5)))) +
  scale_colour_manual(values = palette5,
                    labels=substr(quintileBreaks(dat_infill,"predict_2029.infill"),1,4),
                    name="% Likelihood\nof Development") +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  labs(title= "Development Demand in 2029: Predicted Probabilities",
    subtitle = "Based on population change projections")+
  mapTheme

map_pred29

Mapping predicted development based on projected population change for 2029 doesn’t show any huge changes from where our model predicted development would be in 2019. New development is still predicted to most likely occur in the outer counties. Although, our model is predicting increased development in the northwest and southwest corners of the metro which it was not predicting before we added in the 2029 population projections.


Scenario 2: Supply-side Change Forecast

To begin our supply-side projection, we imagine a new highway in Walton County, in the eastern part of the metro, where development between 2008 and 2019 was less concentrated. This was also a forested area, so the fact that much development had not happened there over the earlier period aroused our curiosity. This way, we can test the extent that a new road investment influences development in a desirable area of land cover. This new road was created as a line segment in ArcGIS and merged with the existing highways shapefile.

#NEW HIGHWAY SEGMENT ONLY
new_highway <- st_read("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/a5a5b0c9099b97f68c0d1aeadcdfe9dbd2960a9a/Data/geojson/new_highway.geojson")

new_highway <- new_highway %>% 
st_transform(st_crs(metro_counties)) %>%
  st_intersection(metro_counties) 

#new_highway <- st_write(new_highway,"C:/Users/Lindsey/Desktop/CPLN675_Land_Use_Modeling_Desktop/Final/LUEM_A5_Repo/Data/geojson/new_highway.geojson")

#ALL HIGHWAYS MERGED TOGETHER -- highways_all --  

highways_all <- st_read("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/a5a5b0c9099b97f68c0d1aeadcdfe9dbd2960a9a/Data/geojson/highways_all.geojson") %>% 
st_transform(st_crs(metro_counties)) %>%
  st_intersection(metro_counties)
ggplot() +
  geom_point(data=dev_change_fishnet, 
             aes(x=xyC(dev_change_fishnet)$x, y=xyC(dev_change_fishnet)$y, colour=development_change)) +
  scale_colour_manual(values = c(gray, "#ED7953FF"),
                      labels=c("No Change","New Development"),
                      name = "") +
  labs(title = "Land Cover Development Change",
       subtitle = "Atlanta metro area | Fishnet centroids with existing highways\nin dark blue and a new highway in cyan") +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  geom_sf(data=AtlantaHighways, color = highlight, linewidth = .5) +
  geom_sf(data=new_highway, colour="cyan", linewidth=1) +
  mapTheme

Now, with the new highways dataset, we can create a new distance-to-highway raster, and from there convert it to fishnet centroids which we merge with the fishnet. This helps us understand how far the fishnet cells are from the new highway.

#new_highway_raster <- 
 #  as(highways_all,'Spatial') %>%
  # rasterize(.,emptyRaster)

#new_highway_raster_distance <- distance(new_highway_raster)
#writeRaster(new_highway_raster_distance, filename=file.path(rast_dir, "new_highway_dist.tif"), format="GTiff", overwrite=TRUE)

new_highway_raster_distance <- raster(paste(dir_git, "187ac7cf231257fc52de0f7f2a1dc2014e4e6f2d/Data/raster/new_highway_dist.tif", sep='/'))

names(new_highway_raster_distance) <- "distance_highways"

new_highwayPoints <-
  rasterToPoints(new_highway_raster_distance) %>%
  as.data.frame() %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(AtlantaMSA_fishnet))

new_highwayPoints_fishnet <- 
  aggregate(new_highwayPoints, AtlantaMSA_fishnet, mean) %>%
  mutate(distance_highways = ifelse(is.na(distance_highways),0,distance_highways))

ggplot() +
  geom_sf(data=metro_counties) +
  geom_point(data=new_highwayPoints_fishnet, aes(x=xyC(new_highwayPoints_fishnet)[,1], 
                                             y=xyC(new_highwayPoints_fishnet)[,2], 
                 colour=factor(ntile(distance_highways,5))),size=1.5) +
  scale_colour_manual(values = palette5,
                      labels=substr(quintileBreaks(new_highwayPoints_fishnet,"distance_highways"),1,8),
                      name="Quintile\nBreaks") +
  geom_sf(data=highways_all, colour = "black") +
  geom_sf(data=new_highway, colour="cyan", linewidth=.75) +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  labs(title = "Distance to Highways",
       subtitle = "As fishnet centroids; Existing highways visualized in black; new highway in cyan") +
  mapTheme

We then use Model 6 to predict how adding a highway might affect development patterns in 2029, based on scenario 1. While this model does not show a true supply-side only effect, it is useful because it allows us to directly compare how adding in a supply-side factor affects the model’s demand-side only predictions.

dat_highway <-
  dat_infill %>%
  dplyr::select(-distance_highways) %>% 
  left_join(as.data.frame(new_highwayPoints_fishnet)) %>% 
  mutate(predict_2029.distance_highways = predict(Model6,. , type="response"))


Map_highway_pred <-
dat_highway %>%
  ggplot() +  
  geom_point(aes(x=xyC(dat_highway)[,1], y=xyC(dat_highway)[,2], colour = factor(ntile(predict_2029.distance_highways,5)))) +
  scale_colour_manual(values = palette5,
                    labels=substr(quintileBreaks(dat_highway,"predict_2029.distance_highways"),1,4),
                    name="% Likelihood\nof Development") +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data=highways_all, colour = "black") +
  geom_sf(data=new_highway, colour="cyan", linewidth=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  labs(title= "Development Supply in 2029: Predicted Probabilities",
       subtitle = "Based on projected development response to a new highway") +
  mapTheme

grid.arrange(ncol=1,
             map_pred29,
             Map_highway_pred
             )

While the change is slight, we see that adding in the new highway does slightly increase the predicted likelihood of development in the area around it.

Allocation

Now that we better understand the effects of demand-side factors (like population growth and proximity to developed areas) and supply-side factors (like new infrastructure available land), we can analyze these factors by county. This allows us to better understand which counties in the Atlanta MSA are better suited to development than others. It also allows us to see which land use and development policies might be best suited to this region.

Let’s first orient ourselves to how each kind of land cover was distributed in 2019.

dat2 <-
  aggregateRaster(theRasterList19, dat) %>%
  dplyr::select(developed19, forest19, farm19, wetlands19, otherUndeveloped19, water19) %>%
  st_set_geometry(NULL) %>%
  bind_cols(.,dat) %>%
  st_sf() %>%
  st_cast("POLYGON")

rasters19_map <- aggregatedRasters19 %>%
  gather(var,value,developed19:water19) %>%
  st_cast("POLYGON") %>%    #just to make sure no weird geometries slipped in
  mutate(X = xyC(.)$x,
         Y = xyC(.)$y) %>%
  ggplot() +
    geom_sf(data=metro_counties) +
    geom_point(aes(X,Y, colour=as.factor(value))) +
    facet_wrap(~var) +
    scale_colour_manual(values = c(gray, highlight),
                        labels=c("False","True"),
                        name = "") +
    geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .25) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .5) +
    labs(title = "Land Cover Types, 2019",
         subtitle = "As fishnet centroids") +
   mapTheme +
   theme()  +
   theme(legend.position = "none")

rasters19_map

We see that much of the metro area, including developed areas, is forested - meaning it has a large amount of tree cover. Farms and other undeveloped lands tend to be located more on the outer edges. Wetlands on the other hand are concentrated mostly in the metro region’s southern counties, along with a number of small ponds and rivers. The largest water bodies, however, are located in the north of the metro.

Mapping these land cover types separately allows us to see that some counties may be better suited to development, while others have more sensitive land cover that should be protected.

However, this method does not allow us to pinpoint where these locations are with much specificity.

Sensitive Land Cover Lost

To better understand where ecologically important lands are being lost for development, we create an indicator layer called sensitive_lost19. This new layer shows where forest or wetlands were lost to development between 2009 and 2019.

dat2 <-
  dat2 %>%
   mutate(sensitive_lost19 = ifelse(forest08 == 1 & forest19 == 0 |
                                    wetlands08 == 1 & wetlands19 == 0,1,0)) %>% 
  st_transform("ESRI:102267")
                      
ggplot() +
  geom_point(data=dat2, aes(x=xyC(dat2)[,1], y=xyC(dat2)[,2], colour=as.factor(sensitive_lost19))) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("No Change","Sensitive Lost"),
                      name = "") +
  geom_sf(data = metro_counties, fill = "transparent", color = "white", linewidth = .5) +
  geom_sf(data = Atlanta_MSA, fill = "transparent", color = "black", linewidth = .75) +
  labs(title = "Sensitive lands lost: 2009 - 2019",
       subtitle = "As fishnet centroids") +
  mapTheme

Interestingly, we see that more sensitive areas were lost to development near the metro’s center during this period.

Landscape Fragmentation

To contextualize these sensitive locations, we group areas where the wetlands19 and forest11 rasters are contiguous to create a sensitive_regions layer containing all sensitive regions with an area greater than 1 acre.

sensitiveRegions <- 
  raster::clump(wetlands19 + forest19) %>% #units are meters
  rasterToPolygons() %>%
  st_as_sf() %>%
  group_by(clumps) %>% 
  summarize() %>%
    mutate(Acres = as.numeric(st_area(.) * 0.00024711)) %>% #There are 0.00024711 acres per square meter and 4046.86 square meters in an acre
    filter(Acres > 4046.86)  %>%
  dplyr::select() %>%
  raster::rasterize(.,emptyRaster) 

sensitiveRegions[sensitiveRegions > 0] <- 1  
names(sensitiveRegions) <- "sensitiveRegions"

dat2 <-
  aggregateRaster(c(sensitiveRegions), dat2) %>%
  dplyr::select(sensitiveRegions) %>%
  st_set_geometry(NULL) %>%
  bind_cols(.,dat2) %>%
  st_sf()

ggplot() +
  geom_point(data=dat2, aes(x=xyC(dat2)[,1], y=xyC(dat2)[,2], colour=as.factor(sensitiveRegions))) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("Other","Sensitive Regions"),
                      name="") +
  geom_sf(data = metro_counties, fill = "transparent", color = "white", linewidth = .5) +
  geom_sf(data = Atlanta_MSA, fill = "transparent", color = "black", linewidth = .75) +
  labs(title = "Sensitive regions",
       subtitle = "Continous areas of either wetlands or forests\ngreater than 1 acre") +
  mapTheme

This gives us a much more clear understanding of which areas might be most sensitive to development than mapping each type of land cover separately did. We see that most of the counties along the outer border of the Atlanta metro region still have relatively low landscape fragmentation. This suggests that these counties should prioritize infill development over suburban sprawl.

Summarize by County

To better understand how these characteristics compare across counties, we create a matrix of demand side and suitability factors.

This matrix allows us to better understand each county’s suitability for development.

county_specific_metrics <- 
  dat2 %>%
  #predict development demand from our model
  mutate(Development_Demand = predict(Model6, dat2, type="response")) %>%
  #get a count count of grid cells by county which we can use to calculate rates below
  left_join(st_set_geometry(dat, NULL) %>% group_by(NAME) %>% summarize(count = n())) %>%
  #calculate summary statistics by county
  group_by(NAME) %>%
  summarize(Total_Farmland = sum(farm19) / max(count),
            Total_Forest = sum(forest19) / max(count),
            Total_Wetlands = sum(wetlands19) / max(count),
            Total_Undeveloped = sum(otherUndeveloped19) / max(count),
            Sensitive_Land_Lost = sum(sensitive_lost19) / max(count),
            Sensitive_Regions = sum(sensitiveRegions) / max(count),
            Mean_Development_Demand = mean(Development_Demand)) %>%
  #get population data by county
  left_join(countyPopulation_2029 %>% 
            mutate(Population_Change = county_projection_2029 - county_population_2019,
                   Population_Change_Rate = Population_Change / county_projection_2029) %>%
            dplyr::select(NAME,Population_Change_Rate))
county_specific_metrics %>%
  gather(Variable, Value, -NAME, -geometry) %>%
  mutate(Variable = factor(Variable, levels=c("Population_Change_Rate","Mean_Development_Demand",
                                              "Total_Farmland","Total_Undeveloped","Total_Forest",
                                              "Total_Wetlands","Sensitive_Land_Lost","Sensitive_Regions",
                                              ordered = TRUE))) %>%
  mutate(Planning_Designation = case_when(
    Variable == "Population_Change_Rate" | Variable == "Mean_Development_Demand" ~ "Demand-Side",
    Variable == "Total_Farmland" | Variable == "Total_Undeveloped"               ~ "Suitable",
    TRUE                                                                         ~ "Not Suitable")) %>%
  ggplot(aes(x=Variable, y=Value, fill=Planning_Designation)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black") +
    facet_wrap(~NAME, ncol=5) +
    coord_flip() +
    scale_y_continuous(breaks = seq(.25, 1, by = .25)) +
    geom_vline(xintercept = 2.5) + geom_vline(xintercept = 4.5) +
    scale_fill_manual(values=c("#0D0887FF","#BD3786FF","#FDC926FF")) +
    labs(title= "County Specific Allocation Metrics", subtitle= "As rates", x="Indicator", y="Rate") +
    plotTheme + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="bottom")

This chart shows three types of metrics:
- Demand-side factors
- Factors suitable for development, and
- Factors not suitable for development

The demand side factors are Mean_Development_Demand and Population_Change_Rate. Mean_Development_Demand is the mean value of predicted development demand for a given county in 2029. Population_Change_Rate is the share of a county’s population that is projected to move-in between 2019 and 2029.

Factors suitable for development are area of Total_Farmland and area of Total_Undeveloped land in a given county.

Factors not suitable for development are area of Sensitive_Regions, Sensitive_Land_Lost, and Total_Wetlands in a county.

In determining our allocation criteria, we determined any area with a wetland was not suitable for development. Attending to the externalities of urban sprawl, we chose to preserve these vital ecosystems. When it comes to forests, however, given that the Atlanta MSA has such extensive tree cover, we determined that forested areas are suitable for development.

Looking at the development suitability indicators across the 21 counties of Georgia’s MSA, we see that all counties have a large percentage of forested land and therefore of sensitive regions. Looking at the rest of the indicators together however, we can see that some counties are better suited for development than others.

For example, we see that Hall County has both a high rate of development demand and of population change. It also has a relatively large amount of farmland and a relatively low amount of total wetlands and sensitive land lost. Together, these factors suggest that Hall County has ample space for development that will not impact sensitive areas (other than forest).

The satellite image below of development in Hall County shows why there is a large amount of forested land cover across the Atlanta metro region. There seems to be a regional preference for dispersed development in forested land.

On the other hand, Clayton County appears to be poorly suited for development because it has relatively high development demand, is experiencing population growth, has a low amount of farmland and undeveloped land, and has a relatively high amount of wetlands.

Let’s look more closely at Hall County (in the North) and Clayton County (to the South) to see what these traits look like spatially.

ggplot() +
  geom_sf(data = metro_counties, fill = gray, color = "white", linewidth = .5) +
  geom_sf(data = filter(metro_counties, NAME=="Hall" | NAME=="Clayton"), fill = "#FDC926FF", color = "white", linewidth = .5) +
  geom_sf(data = Atlanta_MSA, fill = "transparent", color = "black", linewidth = .75) +
  labs(title = "Atlanta MSA",
       subtitle = "Hall and Clayton Counties Highlighted") +
  mapTheme

Hall County

First, we map supply and demand factors for Hall County, to examine an example of a county well suited to further development.

HallCounty <-
  dat2 %>%
    mutate(Development_Demand = predict(Model6, dat2, type="response")) %>%
    filter(NAME == "Hall") %>%
    rename("developed19" = developed19...2)

HallCounty_landUse <- rbind(
  filter(HallCounty, wetlands19 == 1 ) %>%
  dplyr::select() %>% mutate(Land_Use = "Not Suitable"),
  filter(HallCounty, developed19 == 1) %>%
  dplyr::select() %>% mutate(Land_Use = "Developed"))

Map_Hall1 <-
ggplot() +
  geom_sf(data=HallCounty, aes(fill=factor(ntile(Development_Demand,5))), colour=NA) +
  geom_point(data=HallCounty_landUse, aes(x=xyC(HallCounty_landUse)[,1], 
                                        y=xyC(HallCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 3) +
  geom_sf(data=st_intersection(AtlantaHighways,filter(metro_counties, NAME=="Hall")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Development\nDemand",
                    labels=substr(quintileBreaks(HallCounty,"Development_Demand"),1,5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Hall"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Development Potential, 2029: Hall County",
       subtitle = "With Developed and Unsuitable Areas from 2019") +
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2))

Map_Hall2 <-
ggplot() +
  geom_sf(data=HallCounty, aes(fill=factor(ntile(pop_Change,5))), colour=NA) +
  geom_point(data=HallCounty_landUse, aes(x=xyC(HallCounty_landUse)[,1], 
                                        y=xyC(HallCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 3) +
  geom_sf(data=st_intersection(AtlantaHighways, filter(metro_counties, NAME=="Hall")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Population\nChange",
                    labels=substr(quintileBreaks(HallCounty,"pop_Change"),1,5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Hall"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Projected Population, 2029: Hall County",
       subtitle = "With Developed and Unsuitable Areas from 2019") +
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2))

grid.arrange(
  Map_Hall1,
  Map_Hall2,
  ncol=2)

Mapping Hall County’s development potential and projected population with developed land and wetlands (the “not suitable” land) shows us that the County still has a significant amount of area that is suitable for development. In particular, this county would do well to prioritize infill development between highways in its northwest and southern portions where there is predicted to be both high development demand and significant population growth. This would also help to reduce forest fragmentation and pressure on wetlands in the county’s east and northeast.

Clayton County

Next, we map the same supply and demand factors for Clayton County to examine an example of a county not well suited to further development.

ClaytonCounty <-
  dat2 %>%
    mutate(Development_Demand = predict(Model6, dat2, type="response")) %>%
    filter(NAME == "Clayton") %>%
    rename("developed19" = developed19...2)

ClaytonCounty_landUse <- rbind(
  filter(ClaytonCounty, wetlands19 == 1 ) %>%
  dplyr::select() %>% mutate(Land_Use = "Not Suitable"),
  filter(ClaytonCounty, developed19 == 1) %>%
  dplyr::select() %>% mutate(Land_Use = "Developed"))

grid.arrange(
ggplot() +
  geom_sf(data=ClaytonCounty, aes(fill=factor(ntile(Development_Demand,5))), colour=NA) +
  geom_point(data=ClaytonCounty_landUse, aes(x=xyC(ClaytonCounty_landUse)[,1], 
                                        y=xyC(ClaytonCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 6) +
  geom_sf(data=st_intersection(AtlantaHighways,filter(metro_counties, NAME=="Clayton")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Development\nDemand",
                    labels=substr(quintileBreaks(ClaytonCounty,"Development_Demand"),1,5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Clayton"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Development Potential, 2029: Clayton County",
       subtitle = "With Developed and Unsuitable Areas from 2019") + 
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2)),

ggplot() +
  geom_sf(data=ClaytonCounty, aes(fill=factor(ntile(pop_Change,5))), colour=NA) +
  geom_point(data=ClaytonCounty_landUse, aes(x=xyC(ClaytonCounty_landUse)[,1], 
                                        y=xyC(ClaytonCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 6) +
  geom_sf(data=st_intersection(AtlantaHighways, filter(metro_counties, NAME=="Clayton")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Population\nChange",
                    labels=substr(quintileBreaks(ClaytonCounty, "pop_Change"), 1, 5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Clayton"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Projected Population, 2029: Clayton County",
      subtitle = "With Developed and Unsuitable Areas from 2019") + 
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2)), ncol=2)

In contrast to Hall County’s relatively large amount of land that is suitable for development, we see that Clayton County has almost none. The county is almost entirely developed and the two areas in the southern portion of the county that are not developed are dominated by wetland areas that are not suitable for development.

Furthermore, the map shows that the remaining cells are all near wetland areas or not expected to see population growth. Therefore, Clayton County is an ideal location for conservation rather than development in the Atlanta metro area.

Conclusion

Thank you for walking through our model. We conclude by summarizing the strengths, weaknesses, and planning implications of this approach.

Strengths of approach
While not perfect, the strengths of this approach are that it allows us to estimate future urban sprawl based on land cover and population characteristics of a given area. Being able to visually analyze development potential through maps and charts has great utility for communicating the complex nature of urban sprawl to everyone from policymakers to community members with non-technical backgrounds.

Weaknesses
While an incredibly useful tool, because this approach only takes inputs from two study years, it has the potential to predict future development based on a small historical sample and anomalous trends. Furthermore, while the maps and charts are helpful for communication, they are estimations at best, and only one piece of the land allocation toolkit. The convincing nature of these graphics risks communicating these estimates as fact. Finally, the model is only as good as the variables within it. In future studies, we could improve our model by testing additional predictor variables such as school districts, demographic information, and real estate values.

Policy implications
Based on the results of our analysis, the 21 county Atlanta metro region would benefit from conservation easements to prevent further landscape fragmentation. This would protect the vitality of forest ecosystems over the long-term. To calibrate against further landscape fragmentation, this analysis suggests that policymakers could implement an urban growth boundary in the fringe counties to protect the metro’s remaining sensitive regions before they are fully developed. This would have the added benefit of encouraging infill development in the MSA.

---
title: "LUEM A5 - Forecasting Sprawl"
author: "Lindsey Hover & Charlie Townsley"
date: "`r Sys.Date()`"
output: 
  html_document:
    toc: true
    toc_depth: 3
    toc_float: true
    theme: flatly
    highlight: breezedark
    code_folding: hide
    code_download: true
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rm(list=ls())

options(scipen = 999) #turn off scientific notation
```

```{r libraries and themes}
#install.packages("FNN")
#install.packages("ggrepel")
#install.packages("yardstick")
#install.packages("igraph")
#install.packages("caret")
#update.packages("caret")
#install.packages("reactable")

library(tidyverse)
library(sf)
library(raster)
library(knitr)
library(kableExtra)
library(tidycensus)
library(tigris)
library(FNN)
library(caret)
library(yardstick)
library(pscl)
library(plotROC) 
library(ggrepel)
library(pROC)
library(grid)
library(gridExtra)
library(viridis)
library(igraph)
library(maptools)
library(reactable)
library(classInt)

plotTheme <- theme(
  plot.title =element_text(size=12),
  plot.subtitle = element_text(size=8),
  plot.caption = element_text(size = 6),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title.y = element_text(size = 10),
  # Set the entire chart region to blank
  panel.background=element_blank(),
  plot.background=element_blank(),
  #panel.border=element_rect(colour="#F0F0F0"),
  # Format the grid
  panel.grid.major=element_line(colour="#D0D0D0",size=.75),
  axis.ticks=element_blank())

mapTheme <- theme(plot.title =element_text(size=12),
                  plot.subtitle = element_text(size=8),
                  plot.caption = element_text(size = 6),
                  axis.line=element_blank(),
                  axis.text.x=element_blank(),
                  axis.text.y=element_blank(),
                  axis.ticks=element_blank(),
                  axis.title.x=element_blank(),
                  axis.title.y=element_blank(),
                  panel.background=element_blank(),
                  panel.border=element_blank(),
                  panel.grid.major=element_line(colour = 'transparent'),
                  panel.grid.minor=element_blank(),
                  legend.direction = "vertical", 
                  legend.position = "right",
                  plot.margin = margin(1, 1, 1, 1, 'cm'),
                  legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))

#Colors: Viridis Plasma (option "C")
gray <- "#D3D3D3"
darkGray <- "#505251"
highlight <- "#0D0887FF"
palette2 <- c("#FA9E3BFF", "#0D0887FF")
palette3 <- c("#FDC926FF", "#D8576BFF", "#0D0887FF")
palette4 <- c("#FA9E3BFF", "#D8576BFF", "#9C179EFF", "#0D0887FF")
palette5 <- c("#FDC926FF", "#ED7953FF", "#D8576BFF", "#9C179EFF", "#0D0887FF")
palette10 <- c("#F0F921FF", "#FDC926FF", "#FA9E3BFF", "#ED7953FF", "#D8576BFF",
               "#BD3786FF", "#9C179EFF", "#7301A8FF", "#47039FFF", "#0D0887FF")

#R Markdown Cheat sheet for reference - https://www.rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf
```

```{r directories, include=FALSE}
dir_git <- "https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo"

dir_ct <- 'C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data'
```

```{r functions}
#this function converts a column in to quintiles. It is used for mapping.
quintileBreaks <- function(df,variable) {
    as.character(quantile(df[[variable]],
                          c(.01,.2,.4,.6,.8),na.rm=T))
}

#This function can be used to convert a polygon sf to centroids xy coords.
xyC <- function(aPolygonSF) {
  as.data.frame(
    cbind(x=st_coordinates(st_centroid(aPolygonSF))[,1],
          y=st_coordinates(st_centroid(aPolygonSF))[,2]))
} 

#this function convert a raster to a data frame so it can be plotted in ggplot
rast <- function(inRaster) {
  data.frame(
    xyFromCell(inRaster, 1:ncell(inRaster)), 
    value = getValues(inRaster)) }

#this function aggregates rasters
aggregateRaster <- function(inputRasterList, AtlantaMSA_fishnet) {
  #create an empty fishnet with the same dimensions as the input fishnet
  theseFishnets <- AtlantaMSA_fishnet %>% dplyr::select()
  #for each raster in the raster list
  for (i in inputRasterList) {
  #create a variable name corresponding to the ith raster
  varName <- names(i)
  #convert raster to points as an sf
    thesePoints <-
      rasterToPoints(i) %>%
      as.data.frame() %>%
      st_as_sf(coords = c("x", "y"), crs = st_crs(AtlantaMSA_fishnet)) %>%
      filter(.[[1]] == 1)
  #aggregate to the fishnet
    thisFishnet <-
      aggregate(thesePoints, AtlantaMSA_fishnet, length) %>%
      mutate(!!varName := ifelse(is.na(.[[1]]),0,1))
  #add to the larger fishnet
    theseFishnets <- cbind(theseFishnets, thisFishnet)
  }
  #output all aggregates as one large fishnet
   return(theseFishnets)
}
```

# Introduction: This Model and Its Significance to Planners

Welcome! In this interactive R-markdown memo, we employ historical patterns of development and population trends in the Atlanta, Georgia metropolitan statistical area ("MSA") to predict future development scenarios. Specifically, we examine the spatial relationships that exist among population growth and highway infrastructure with land development. Through these models, we explore the relationships between each of these independent variables and urban sprawl, our dependent variable.

Urban sprawl is a development scenario characterized by "leap frogging" development, a development scenario that disproportionately consumes land and environmental resources, with significant negative ecological externalities. These models will predict likely urban sprawl for the Atlanta metro area based on two scenarios. The models presented here enable planners and policy makers to preemptively develop policies to calibrate against urban sprawl, empowering municipal governments to create conscientious land allocation plans.

The model developed will predict two different scenarios: demand-side and supply-side. In the demand-side model, we attempt to understand the relationship that population growth has with urban sprawl in the MSA. Specifically, we predict the demand for development according to population growth, and how this would spatially manifest based on existing land use trends and population projections. The supply-side model will predict how future development might occur following the construction of a new, theoretical highway, supplying infrastructure. Through this scenario, we attempt to ascertain the relationship that highway development has with urban sprawl.

Let's explore how sprawl can occur, unmitigated, based on these independent variables. We use land cover and population data from the years 2008/2009 and 2019 to predict for the year 2029.

-   Region of interest: Atlanta, GA metro

-   Years of study: 2008-2019

-   Projecting to 2029

-   Data level: fishnet grid cell

The following chunks of code will collect the prerequisite data needed for the model from the two years of study, structure the data in preparation for modeling, and run the demand - and supply- side models based on the study years' data to project for the year 2030.

# Data Gathering and Feature Engineering

First, we pull in and visualize our spatial datasets of the Atlanta MSA's 21 counties.

```{r creating GeoJSON from Shapefile, include=FALSE, eval=FALSE}
#SHP1 <- st_read("C:/Users/Lindsey/Desktop/CPLN675_Land_Use_Modeling_Desktop/Final/LUEM_A5_Repo/Shapefiles/SHP1.shp")

#GeoJSON1 <- st_write(SHP1, "C:/Users/Lindsey/Desktop/CPLN675_Land_Use_Modeling_Desktop/Final/LUEM_A5_Repo/GeoJSONs/SHP1.geojson")

#GeoJSON_x <- st_read("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/d9859fc1069dd89d44fcaf345e722a9bf601a550/Data/geojson/GeoJSON_x.geojson") ### specify the file name (note: this is the permalink, but you change the domain to raw.githubusercontent.com AND you remove /blob/)


#Atlanta_metro_bounday <- st_read("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/d9859fc1069dd89d44fcaf345e722a9bf601a550/Data/geojson/Atlanta_Metro_Boundary.geojson")

```

And retrieve the Atlanta MSA boundary using the `tigris` package.

```{r Atlanta MSA shapefile, results='hide'}
#List of the Atlanta region MPOs 21 counties
Counties <- c("Barrow", "Bartow", "Carroll", "Cherokee", 
              "Clayton", "Cobb", "Coweta", "Dawson", 
              "DeKalb", "Douglas", "Fayette", "Forsyth", 
              "Fulton", "Gwinnett", "Hall", "Henry", 
              "Newton", "Paulding", "Rockdale", "Spalding", "Walton")

#First pull all MSAs in the U.S., then search for yours and filter with the correct CBSAFP code
metro_counties <- counties(state = "GA", cb = TRUE, resolution = "500k", year = NULL) %>%
  filter(NAME %in% Counties) %>% 
  st_transform("ESRI:102267")

#Merge counties into one shapefile showing just the boundary
Atlanta_MSA <- st_union(metro_counties) %>% 
  st_transform("ESRI:102267")
```

```{r plot msa}
#plot MSA boundary to make sure its right
ggplot() + 
  geom_sf(data = metro_counties) +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  mapTheme + 
  labs(title = "Atlanta 21 County MSA")
```


```{r msa conversion, include=FALSE, eval=FALSE}
## SKIP ##
#Convert shapefile to geojson so can store on github
#st_write(metro_counties, append = FALSE, "C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/geojson/Atlanta_Metro_Counties.geojson")

#st_write(metro_counties, append = FALSE, "C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/shapefile/Atlanta_Metro_Boundary/Atlanta_Metro_Counties.shp")

#st_write(metro_counties, append = FALSE, "C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/geojson/AtlantaMSA_Boundary_21counties.geojson")

#st_write(metro_counties, append = FALSE, "C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/shapefile/Atlanta_Metro_Boundary_21county/AtlantaMSA_Boundary_21counties.shp")
```

We then create a fishnet of the Atlanta MSA using 1000' x 1000' cells. A fishnet is a vector data set that will allow us to attach other information to the forthcoming raster data set on land cover.

```{r fishnet}
AtlantaMSA_fishnet <- 
    st_make_grid(metro_counties, cellsize = 1000, square = TRUE) %>%
  st_sf() %>% 
  st_transform("ESRI:102267")

AtlantaMSA_fishnet <-
  AtlantaMSA_fishnet[metro_counties,]

#plot MSA boundary to make sure its right
ggplot()+
  geom_sf(data = AtlantaMSA_fishnet,
          fill = "lightgrey") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title = "Atlanta MSA Fishnet") +
  mapTheme
```

## Land Cover

### Land Cover by Year and Type

We retrieve the raster files of land cover for the Atlanta MSA. We downloaded these datasets from the Multi-Resolution Land Characteristics Consortium's National Land Cover Database, and clipped them in ArcGIS to a rectangular boundary extent that aligned with the farthest points of the Atlanta region's 21 county border.

```{r Import full original Land Cover data, eval=FALSE}
##SKIP##
#load full size rasters
lc_2008 = raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/56bd35b343d39e88bd8e1fdc89957e6ccb804fe9/Data/raster/2008LC_proj.tif")
    
lc_2019 = raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/56bd35b343d39e88bd8e1fdc89957e6ccb804fe9/Data/raster/2019LC_proj.tif")
```

The raster files were still very large, so we down-sampled them to make their size more manageable and then clipped them precisely to the MSA boundary.

```{r downsample rasters crop and export, eval=FALSE}
## SKIP ##

#down sample the raster to make more manageable
lc_2008 <- aggregate(lc_2008, fact = 7, fun = modal)
lc_2019 <- aggregate(lc_2019, fact = 7, fun = modal)

#crop rasters
lc_2008_crop <- crop(lc_2008, extent(metro_counties))
lc_2019_crop <- crop(lc_2019, extent(metro_counties)) 

#mask rasters
lc_2008_crop <- mask(x = lc_2008_crop, mask = metro_counties)
lc_2019_crop <- mask(x = lc_2019_crop, mask = metro_counties)

#Export downsized rasters
#writeRaster(lc_2008_crop, filename=file.path(rast_dir, "lc2008_sml.tif"), format="GTiff", overwrite=TRUE)

#writeRaster(lc_2019_crop, filename=file.path("C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/raster", "lc2019_sml.tif"), format="GTiff", overwrite=TRUE)
```

```{r import downsampled rasters}
lc_2008 <- raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/24f08e61c5ec983b3fa39f1f3b03fb90aa3a13ee/Data/raster/lc2008_sml.tif") 

lc_2019 <- raster("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/24f08e61c5ec983b3fa39f1f3b03fb90aa3a13ee/Data/raster/lc2019_sml.tif") 
```

This allows us to visualize Atlanta's land cover in 2008 and 2019.

```{r Plot land cover for 2008 and 2019, fig.height=10, fig.width=8}
grid.arrange(ncol = 1,
ggplot() +
  geom_raster(data=rast(lc_2008) %>% na.omit %>% filter(value > 0), 
              aes(x,y,fill=as.factor(value))) +
  scale_fill_viridis(discrete=TRUE,
                     name ="",
                     option = "C") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "Land Cover, 2008") +
  mapTheme +
  theme(legend.direction="horizontal"),


ggplot() +
  geom_raster(data=rast(lc_2019) %>% na.omit %>% filter(value > 0), 
              aes(x, y, fill=as.factor(value))) +
  scale_fill_viridis(discrete=TRUE,
                     name ="",
                     option = "C") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
      geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "Land Cover, 2019") +
  mapTheme +
  theme(legend.direction="horizontal")
)
```

Viewing land cover at the MSA level in 2008 and 2019 shows that change is happening at too small a scale to discern visually at this extent. Therefore, we summon the powers of R. First, we distill the many specific land cover classifications into six simple buckets: developed land, forest, farmland, wetlands, water bodies, and other undeveloped lands.

### Land Cover by Type

```{r reclass Land Cover}
developed08 <- lc_2008 == 21 | lc_2008 == 22 | lc_2008 == 23 | lc_2008 == 24
forest08 <- lc_2008 == 41 | lc_2008 == 42 | lc_2008 == 43 
farm08 <- lc_2008 == 81 | lc_2008 == 82 
wetlands08 <- lc_2008 == 90 | lc_2008 == 95 
otherUndeveloped08 <- lc_2008 == 52 | lc_2008 == 71 | lc_2008 == 31 
water08 <- lc_2008 == 11

developed19 <- lc_2019 == 21 | lc_2019 == 22 | lc_2019 == 23 | lc_2019 == 24
forest19 <- lc_2019 == 41 | lc_2019 == 42 | lc_2019 == 43 
farm19 <- lc_2019 == 81 | lc_2019 == 82 
wetlands19 <- lc_2019 == 90 | lc_2019 == 95 
otherUndeveloped19 <- lc_2019 == 52 | lc_2019 == 71 | lc_2019 == 31 
water19 <- lc_2019 == 11

names(developed08) <- "developed08"
names(forest08) <- "forest08"
names(farm08) <- "farm08"
names(wetlands08) <- "wetlands08"
names(otherUndeveloped08) <- "otherUndeveloped08"
names(water08) <- "water08"

names(developed19) <- "developed19"
names(forest19) <- "forest19"
names(farm19) <- "farm19"
names(wetlands19) <- "wetlands19"
names(otherUndeveloped19) <- "otherUndeveloped19"
names(water19) <- "water19"
```

This distilled, categorical dataset for each year is then aggregated to our fishnet of the Atlanta MSA. The resulting maps show that Atlanta is a heavily forested area. Development is vast, with its anchor in the center of the MSA, radiating outward. A significant amount of farmland exists in a thick band against the MSA's periphery. Undeveloped areas are scattered, also mostly near the MSA's periphery.

```{r turn rasters into fishnet dataframe, fig.width=12, fig.height=10}

theRasterList08 <- c(developed08, forest08, farm08, wetlands08, otherUndeveloped08, water08)

theRasterList19 <- c(developed19, forest19, farm19, wetlands19, otherUndeveloped19, water19)

aggregatedRasters08 <-
  aggregateRaster(theRasterList08, AtlantaMSA_fishnet) %>%
  dplyr::select(developed08, forest08, farm08, wetlands08, otherUndeveloped08, water08) %>%
  mutate_if(is.numeric, as.factor)

aggregatedRasters19 <-
  aggregateRaster(theRasterList19, AtlantaMSA_fishnet) %>%
  dplyr::select(developed19, forest19, farm19, wetlands19, otherUndeveloped19, water19) %>%
  mutate_if(is.numeric, as.factor)

rasters08_map <- aggregatedRasters08 %>%
  gather(var,value,developed08:water08) %>%
  st_cast("POLYGON") %>%    #just to make sure no weird geometries slipped in
  mutate(X = xyC(.)$x,
         Y = xyC(.)$y) %>%
  ggplot() +
    geom_sf(data=metro_counties) +
    geom_point(aes(X,Y, colour=as.factor(value))) +
    facet_wrap(~var) +
    scale_colour_manual(values = c(gray, highlight),
                        labels=c("False","True"),
                        name = "") +
    geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .25) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .5) +
    labs(title = "Land Cover Types, 2008",
         subtitle = "As fishnet centroids") +
   mapTheme +
   theme()  +
   theme(legend.position = "none")

rasters08_map
```

### Land Cover Change

To identify areas with land cover change, we use the raster data sets and reclassify the land cover types as numerical values, where developed lands have the value of 1 and undeveloped lands have the value of 0, for each of the years' data sets. We are then able to add these data sets together on a cell-by-cell basis to understand where development has occurred in 2019 and where it was not in 2008. When added together, the cells with values of 1 represent where new development has occurred; a value of 2 shows where development was in 2008 and remained in 2019, and a value of 0 shows land that was and remains undeveloped. This function is like using raster calculator in GIS.

```{r reclass matrix, results='hide'}
#creat matrix for reclassifying land cover
reclassMatrix <- 
  matrix(c(
    0,12,0,
    12,24,1,
    24,Inf,0),
  ncol=3, byrow=T)

reclassMatrix
```

```{r developed land cover change plot}
#reclassify land cover change using reclass matrix

lc_2008r <- 
  reclassify(lc_2008, reclassMatrix)
lc_2019r <- 
  reclassify(lc_2019, reclassMatrix)

lc_2008r[lc_2008r < 1] <- 0
names(lc_2008r) <- "dev08"

lc_2019r[lc_2019r < 1] <- 0
names(lc_2019r) <- "dev19"

development_change <- lc_2008r + lc_2019r

# convert the raster to a vector
dev_change_vec <- as.vector(development_change)

# create the histogram plot with ggplot2
ggplot(data.frame(value = dev_change_vec), aes(x=value)) + 
  geom_histogram(binwidth=.5, fill=highlight, color="white") +
  labs(title="Development Change by Type (2008-2019)",
       subtitle = "0 = Remained Undeveloped, 1 = New Development, 2 = Already Developed",
       x="Development Change",
       y="Frequency") +
  plotTheme
```

This shows us that most cells in the MSA remained undeveloped from 2008 to 2019, many of them were already developed, and a few of the previously undeveloped cells converted to developed. We can then visualize where the new development cells with the 1 value are, which shows that new development has been scattered throughout the MSA.

```{r plotting land cover change, fig.width=8}

development_change[development_change != 1] <- NA

ggplot() +
  geom_sf(data=Atlanta_MSA) +
  geom_raster(data=rast(development_change) %>% na.omit, 
              aes(x,y,fill=as.factor(value))) +
  scale_fill_viridis(discrete=TRUE,
                     option = "C",
                     name ="Cells \nwith Land Cover\nChange") + 
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title="Development Land Cover Change (2008-2019)",
       subtitle = "Atlanta metro area | Raster cells") +
  mapTheme +
   theme(legend.position = "none")
  
```

We then join this dataset, mapping new development, to our fishnet. This has the effect of magnifying where the development change has happened a bit.

```{r land cover development change, fig.width=8}
dev_change <- reclassify(development_change, reclassMatrix)
                         
dev_change[development_change < 1] <- NA

names(dev_change) <- "development_change"


changePoints <-
  rasterToPoints(dev_change) %>%
  as.data.frame() %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(AtlantaMSA_fishnet))

dev_change_fishnet <- 
  aggregate(changePoints, AtlantaMSA_fishnet, sum) %>%
  mutate(development_change = ifelse(is.na(development_change),0,1),
         development_change = as.factor(development_change))

ggplot() +
  geom_point(data=dev_change_fishnet, 
             aes(x=xyC(dev_change_fishnet)$x, y=xyC(dev_change_fishnet)$y, colour=development_change)) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("No Change","New Development"),
                      name = "") +
    geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "New Development (2019)", subtitle = "Atlanta metro area | Fishnet centroids") +
  mapTheme

#dev_change_fishnet <- st_write(dev_change_fishnet, "C:/Users/Lindsey/Desktop/CPLN675_Land_Use_Modeling_Desktop/Final/LUEM_A5_Repo/Data/geojson/dev_change_fishnet.geojson")

```

## Census Data: Population

Next, we set up our MSA population data, which will prepare us for the demand-side model. The next two chunks download the population data for 2009 and 2019 from the American Community Survey 5-year Estimates on a census tract level.

```{r metro pop 2009, echo=FALSE, results='hide'}
# Look here for variable codes
acs_vars <- load_variables(2009, "acs5")

#Pull 2009 population
Pop09 <- 
  get_acs(geography = "tract", variables = "B01003_001", year = 2009,
                state = 13, geometry = TRUE, 
                county = Counties) %>%
  rename("pop09" = estimate) %>% 
  st_transform(st_crs(AtlantaMSA_fishnet))
```

```{r metro pop 2019, echo=FALSE, results='hide'}
#Pull 2019 population
Pop19 <- 
  get_acs(geography = "tract", variables = "B01003_001", year = 2019,
                state = 13, geometry = TRUE, 
                county = Counties) %>%
  rename("pop19" = estimate) %>% 
  st_transform(st_crs(AtlantaMSA_fishnet)) %>% 
  st_buffer(-1) #buffer the the tracts by -1ft. This is done because tidycensus appears to return geometries that are problematic when subjected to the area weighted interpolation function
```

We then visualize the population for the two years by the respective census tract boundaries for each of the years.

```{r plot both pop years, fig.width=12}
grid.arrange(
ggplot() +
  geom_sf(data = Pop09, aes(fill=factor(ntile(pop09,5))), colour=NA) +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  scale_fill_manual(values = palette5,
                    labels=quintileBreaks(Pop09,"pop09"),
                   name="Quintile\nBreaks") +
  labs(title="2009 Population\nAtlanta 21-County MSA",
       subtitle="By Census Tract") +
  mapTheme,

ggplot() +
  geom_sf(data = Pop19, aes(fill=factor(ntile(pop19,5))), colour=NA) +
  scale_fill_manual(values = palette5,
                    labels=quintileBreaks(Pop19,"pop19"),
                    name="Quintile\nBreaks") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title="2019 Population\nAtlanta 21-County MSA",
       subtitle="By Census Tract") +
  mapTheme, ncol=2)
```

We then reconcile the tract boundaries with the fishnet grid cells to be able to conduct analysis and build models with our fishnet data set.

```{r reconcile tracts and fishnet cells}
### Breaking here on select(fishnetID)
AtlantaMSA_fishnet <-
  AtlantaMSA_fishnet %>%
  rownames_to_column("fishnetID") %>% 
  mutate(fishnetID = as.numeric(fishnetID)) %>%
  dplyr::select(fishnetID)

fishnetPop09 <-
  st_interpolate_aw(Pop09["pop09"], AtlantaMSA_fishnet, extensive=TRUE) %>%
  as.data.frame(.) %>%
  rownames_to_column(var = "fishnetID") %>%
  left_join(AtlantaMSA_fishnet %>%
              mutate(fishnetID = as.character(fishnetID)),
            ., by=c("fishnetID"='fishnetID')) %>% 
  mutate(pop09 = replace_na(pop09,0)) %>%
  dplyr::select(pop09)

fishnetPop19 <-
  st_interpolate_aw(Pop19["pop19"], AtlantaMSA_fishnet, extensive=TRUE) %>%
  as.data.frame(.) %>%
  rownames_to_column(var = "fishnetID") %>%
  left_join(AtlantaMSA_fishnet %>%
              mutate(fishnetID = as.character(fishnetID)),
            ., by=c("fishnetID"='fishnetID')) %>% 
  mutate(pop19 = replace_na(pop19,0)) %>%
  dplyr::select(pop19)

fishnetPop <- 
  cbind(fishnetPop09, fishnetPop19) %>%
  dplyr::select(pop09,pop19) %>%
  mutate(pop_Change = pop19 - pop09)
```

We can then compare population by tract vs population by grid cell.

```{r plot pop tract and grid cell, fig.width=12}
grid.arrange(
ggplot() +
  geom_sf(data=Pop19, aes(fill=factor(ntile(pop19,5))),colour=NA) +
  scale_fill_manual(values = palette5,
                    labels=substr(quintileBreaks(Pop19,"pop19"),1,4),
                   name="Quintile\nBreaks") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title="2019 Population\nAtlanta 21-County MSA",
       subtitle="Represented as tracts; Boundaries omitted") +
  mapTheme,

ggplot() +
  geom_sf(data=fishnetPop, aes(fill=factor(ntile(pop19,5))), colour=NA) +
  scale_fill_manual(values = palette5,
                   labels=substr(quintileBreaks(fishnetPop,"pop19"),1,4),
                   name="Quintile\nBreaks") +
  geom_sf(data = Atlanta_MSA, 
          color = "black", fill = "transparent", linewidth = .75) +
  labs(title="2019 Population\nAtlanta 21-County MSA",
       subtitle="Represented as fishnet gridcells; Boundaries omitted") +
  mapTheme, ncol=2)
```

## Highway Distance

Next, we prepare our supply-side model by loading in spatial data for Atlanta's existing highways.

```{r highway to geojson, include=FALSE, eval=FALSE}
#Skip
#metro_highways <- st_read("C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/shapefile/AtlantaMetro_Major_Roads/Major_Roads.shp")

#st_write(metro_highways, append = FALSE, "C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/LUEM_A5_Repo/Data/geojson/Major_Roads.geojson")
```

```{r highways, fig.width=8, echo=FALSE, message=FALSE}
AtlantaHighways <-
  st_read("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/8b689651286bb4ff28554095bd39ce1bd87655e4/Data/geojson/Major_Roads.geojson") %>%
  st_transform(st_crs(metro_counties)) %>%
  st_intersection(metro_counties) %>% 
  filter(FEATURE_TY == "State Highway")

emptyRaster <- development_change
emptyRaster[] <- NA

ggplot() +
  geom_point(data=dev_change_fishnet, 
             aes(x=xyC(dev_change_fishnet)$x, y=xyC(dev_change_fishnet)$y, colour=development_change)) +
  scale_colour_manual(values = c(gray, "#ED7953FF"),
                      labels=c("No Change","New Development"),
                      name = "") +
  geom_sf(data=AtlantaHighways, color = highlight, linewidth = .5) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  labs(title = "Highways and New Development") +
  mapTheme
```

This suggests there may be an association between proximity to highways and new development. To better understand this relationship, we calculate how far each fishnet cell is from a highway. We do this by converting the highway layer to raster, converting the raster to points, and then calculating the mean distance from a highway for each grid cell.

```{r convert highways to raster, eval=FALSE}
#SKIP#
#create a raster layer of highways
highway_raster <- 
  as(AtlantaHighways,'Spatial') %>%
  rasterize(.,emptyRaster)

#Export rasters

rast_dir = paste(dir_ct, 'raster', sep='/')

#writeRaster(highway_raster, filename=file.path(rast_dir, "highway_raster.tif"), format="GTiff", overwrite=TRUE)
```

This visualizes each cell's distance from a highway.

```{r dist from highways, echo=FALSE, fig.width=8}
#import highway raster
highway_raster <- raster(paste(dir_git, "a5a5b0c9099b97f68c0d1aeadcdfe9dbd2960a9a/Data/raster/highway_raster.tif", sep='/'))

highway_raster_distance <- distance(highway_raster)
names(highway_raster_distance) <- "distance_highways"

#Export highway raster distance than import file from github to speed processing
#highway_raster <- raster(paste(dir_git, "a5a5b0c9099b97f68c0d1aeadcdfe9dbd2960a9a/Data/raster/highway_raster.tif", sep='/'))

highwayPoints <-
  rasterToPoints(highway_raster_distance) %>%
  as.data.frame() %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(AtlantaMSA_fishnet))

highwayPoints_fishnet <- 
  aggregate(highwayPoints, AtlantaMSA_fishnet, mean) %>%
  mutate(distance_highways = ifelse(is.na(distance_highways),0,distance_highways))

ggplot() +
  geom_sf(data=metro_counties) +
  geom_point(data=highwayPoints_fishnet, aes(x=xyC(highwayPoints_fishnet)[,1], 
                                             y=xyC(highwayPoints_fishnet)[,2], 
                 colour=factor(ntile(distance_highways,5))),size=1.5) +
  scale_colour_manual(values = palette5,
                      labels=substr(quintileBreaks(highwayPoints_fishnet,"distance_highways"),1,8),
                      name="Quintile\nBreaks") +
  geom_sf(data=AtlantaHighways, colour = "black") +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "Distance to Highways",
       subtitle = "In Feet; Highways visualized in black") +
  mapTheme
```

## Calculating the Spatial Lag of Development

We measure accessibility by way of a spatial lag, hypothesizing that new development is a function of distance to existing development. The shorter the distance, the more accessible a grid cell is to existing development. We measure this by calculating the average distance from each grid cell to its 2 nearest developed neighboring grid cells in 2009.

```{r nn function}
nn_function <- function(measureFrom,measureTo,k) {
  #convert the sf layers to matrices
  measureFrom_Matrix <-
    as.matrix(measureFrom)
  measureTo_Matrix <-
    as.matrix(measureTo)
  nn <-   
    get.knnx(measureTo, measureFrom, k)$nn.dist
    output <-
    as.data.frame(nn) %>%
    rownames_to_column(var = "thisPoint") %>%
    gather(points, point_distance, V1:ncol(.)) %>%
    arrange(as.numeric(thisPoint)) %>%
    group_by(thisPoint) %>%
    summarize(pointDistance = mean(point_distance)) %>%
    arrange(as.numeric(thisPoint)) %>% 
    dplyr::select(-thisPoint) %>%
    pull()
  
  return(output)  
}
```

```{r development spatial lag, fig.width=8}
AtlantaMSA_fishnet$lagDevelopment08 <-
    nn_function(xyC(AtlantaMSA_fishnet),
                xyC(filter(aggregatedRasters08, developed08==1)),
                2)

# compute natural breaks for "lagDevelopment08"
breaks <- classIntervals(AtlantaMSA_fishnet$lagDevelopment08, n=4, style="jenks")

#Plot
ggplot() +
  geom_sf(data=metro_counties) +
  geom_point(data=AtlantaMSA_fishnet, 
             aes(x=xyC(AtlantaMSA_fishnet)[,1], y=xyC(AtlantaMSA_fishnet)[,2], 
                 colour=cut(lagDevelopment08, breaks$brks)), size=1.5) +
  scale_colour_manual(values=palette5,
                      labels=format(breaks$brks, nsmall=1),
                      name="Distance (ft)") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title = "Spatial Lag to 2008 Development",
       subtitle = "As fishnet centroids") +
  mapTheme
```

This shows most of the area in the MSA's inner counties was near existing development in 2008. It was only the counties at the outer fringes that still had a few areas relatively distant from development.

## Create Final Dataset

We then aggregate all these data to a final fishnet data set for feeding into our model. This fishnet includes columns which indicate, cell-by-cell, if there has been development change, land use type, the cell's distance to highways, the cell's distance to existing development, and its population in both years.

```{r combine all variables into one dataset}
dat <-  
  cbind(
    AtlantaMSA_fishnet, dev_change_fishnet, highwayPoints_fishnet, fishnetPop, aggregatedRasters08) %>%
  dplyr::select(development_change, developed08, forest08, farm08, wetlands08, otherUndeveloped08, water08,
                pop09, pop19, pop_Change, distance_highways, lagDevelopment08) %>%
  st_join(metro_counties) %>%
  mutate(developed19 = ifelse(development_change  == 1 & developed08 == 1, 0, 1)) %>% #charlie to confirm if the else value should be 1 or 'developed'
  filter(water08 == 0) 
```

# Data Exploration

Now that all of our data are assembled, we explore how each factor uncovered thus far is related to new development. This plot displays distance to highways and the spatial lag of development as continuous variables. This shows us that new development has been, on average, closer to highways, and closer to existing development in aggregate, than cells that had no change in land cover. This supports our previous hypothesis that new development might be occurring closer to highways and to previously developed areas.

```{r plotting the averages (means) of infrastructure/development}

dat %>%
  dplyr::select(distance_highways,lagDevelopment08,development_change) %>% 
  gather(Variable, Value, -development_change, -geometry) %>%
  ggplot(., aes(development_change, Value, fill=development_change)) + 
    geom_bar(position = "dodge", stat = "summary", fun.y = "mean") +
    facet_wrap(~Variable) +
    scale_fill_manual(values = palette2,
                      labels=c("No Change","New Development"),
                      name="") +
    labs(title="New Development as a Function of the Continuous Variables") +
    plotTheme

```

This next plot shows that new development has, on average, occurred in cells where the population is greater or growing. From this, we learn that development in Atlanta MSA is also positively correlated with the amount of population and population growth in an area.

```{r bar plots of population variables}

dat %>%
  dplyr::select(pop09,pop19,pop_Change,development_change) %>%
  gather(Variable, Value, -development_change, -geometry) %>%
  ggplot(., aes(development_change, Value, fill=development_change)) + 
    geom_bar(position = "dodge", stat = "summary", fun.y = "mean") +
    facet_wrap(~Variable) +
    scale_fill_manual(values = palette2,
                      labels=c("No Change","New Development"),
                      name="") +
    labs(title="New Development as a Function of Factor Variables") +
    plotTheme

```

From the above data exploration, we can see that development is positively correlated with proximity to highways and development, as well as with existing and growing population. However, this leaves us wondering which types of land cover are most likely to convert to developed land?

The conversion table shows us that forested areas in 2008 were the most likely to have been developed by 2019. This shows that Atlanta has a preference for developing woodlands first, with areas adjacent to those that are already developed as a close second.

```{r land cover conversion tables, fig.width=6, fig.height=6}
dat %>%
  dplyr::select(development_change:otherUndeveloped08,developed08) %>%
  gather(Land_Cover_Type, Value, -development_change, -geometry) %>%
   st_set_geometry(NULL) %>%
     group_by(development_change, Land_Cover_Type) %>%
     summarize(n = sum(as.numeric(Value))) %>%
     ungroup() %>%
    mutate(Conversion_Rate = paste0(round(100 * n/sum(n), 2), "%")) %>%
    filter(development_change == 1) %>%
  dplyr::select(Land_Cover_Type,Conversion_Rate) %>%
  reactable(theme = reactableTheme(style = list(fontFamily = "Arial, sans-serif", fontSize = "1.0rem")))
```

<br>

# Binary Logistic Regression - Predicting for 2019

Now that we have gathered a number of factors we hypothesized to be related to new development, and have proved that they have at least some association with development, we can build and train a model to predict new development based on them.

We chose to use binomial logistic regression for this assignment because it is a type of statistical model that is well suited to analyzing relationships between a binary response variable and one or more predictor variables. It estimates the probability of the response variable taking one of two possible values (in this case developed or not developed) based on the values of the predictor variables.

To predict the likelihood of future development, we use data from 2008/2009 to create a series of logistic regression models to predict land use change in 2019. We then evaluate the performance of these models to select our best model for predicting development in 2029.

## Model Building

The first step of this process is randomly splitting the data in half to create a testing data set with one half and a training data set with the other.

```{r training sets, results='hide'}
set.seed(3456)
trainIndex <- 
  createDataPartition(dat$developed08, p = .50,
                                  list = FALSE,
                                  times = 1)
datTrain <- dat[ trainIndex,]
datTest  <- dat[-trainIndex,]

```

We then build a series of binary logistic regression models based on our independent variables.

The six models we created are numbered to reflect increasing complexity. While Model 1 concentrates exclusively on land cover type and whether development change occurred between the two time periods, later models incorporate the spatial lag of development, population elements, and our distance variables.

```{r linear models and plotting their R sqaured values and test set histogram plot}
Model1 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08, 
              family="binomial"(link="logit"), data = datTrain)

Model2 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08, 
              family="binomial"(link="logit"), data = datTrain)
              
Model3 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop09, 
              family="binomial"(link="logit"), data = datTrain)          
              
Model4 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop09 + pop19, 
              family="binomial"(link="logit"), data = datTrain)              
            
Model5 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop09 + pop19 + pop_Change,
              family="binomial"(link="logit"), data = datTrain)              
              
Model6 <- glm(development_change ~ wetlands08 + forest08  + farm08 + otherUndeveloped08 + lagDevelopment08 + pop_Change + pop09 + pop19 + distance_highways, 
              family="binomial"(link="logit"), data = datTrain) 


modelList <- paste0("Model", 1:6)

map_dfc(modelList, function(x)pR2(get(x)))[4,] %>%
  setNames(paste0("Model",1:6)) %>%
  gather(Model,McFadden) %>%
  ggplot(aes(Model,McFadden)) +
    geom_bar(stat="identity", fill = highlight) +
    labs(title= "McFadden R-Squared by Model") +
    plotTheme

```

We then examine the McFadden R-Squared of the six models to select our best final model. Model 1 clearly performs more poorly than the others. However, models 2-4 also perform moderately less well than the remaining models. Therefore, we eliminate models 1-4 and choose Model 6 as our final model because it performs the best and allows us to explore the effects of all of our variables of interest.

We then use the predicted values from Model 6 to create a histogram of the model's test set predicted probabilities. This shows us that a high density of cells have roughly a 40% likelihood of development. This makes sense given the sprawling nature of the Atlanta MSA's existing development patterns, which are acreage intensive. It also makes sense given the map of new development we plotted earlier, in which it seems feasible that roughly /~30-40% of the MSA was covered by new development in 2019.

```{r Density plot of probabilities observed by class}
testSetProbs <- 
  data.frame(class = datTest$development_change,
             probs = predict(Model6, datTest, type="response")) 

chart_testProbs <-  
ggplot(testSetProbs, aes(probs)) +
  geom_density(aes(fill=class), alpha=0.75) +
  scale_fill_manual(values = palette2,
                    labels=c("No Change","New Development")) +
  labs(title = "Histogram of test set predicted probabilities",
       x="Predicted Probabilities",y="Density") +
  plotTheme

chart_testProbs
```

## Accuracy

This next segment tests our model's:

1)  Sensitivity: the extent to which it accurately predicts new development, or "true positives", and:
2)  Specificity, the extent to which it accurately predicts "true negatives."

Our ultimate goal with a land cover change model is to optimize the balance between the model's predictions with regard to both sensitivity and specificity. Meaning, we want the model to be capable of predicting where development occurs so that we can plan for those areas, but we also want it to accurately predict where development would not occur, to inform our other land use planning.

At the first two thresholds specified below, 5% and 17%, we can see that the 5% threshold very accurately predicts the "true negatives," where development does not occur. The threshold of 17%, however, performs better on the "true positive" rate of predicted v. actual development, that has occurred. We found, however, that a threshold of around 34% gives the best balance between the trade-offs of sensitivity and specificity. We chose 34% because it gives us the highest overall accuracy, and allows us to keep both our sensitivity and specificity over 50%.

```{r testing model sensitivity, fig.height=6}
options(yardstick.event_first = FALSE)

testSetProbs <- 
  testSetProbs %>% 
  mutate(predClass_05 = as.factor(ifelse(testSetProbs$probs >= 0.05 ,1,0)),
         predClass_17 = as.factor(ifelse(testSetProbs$probs >= 0.17 ,1,0)),
         predClass_34 = as.factor(ifelse(testSetProbs$probs >= 0.34 ,1,0))) 

testSetProbs %>%
  dplyr::select(-probs) %>%
  gather(Variable, Value, -class) %>%
  group_by(Variable) %>%
  summarize(Sensitivity = round(yardstick::sens_vec(class,factor(Value)),2),
            Specificity = round(yardstick::spec_vec(class,factor(Value)),2),
            Accuracy = round(yardstick::accuracy_vec(class,factor(Value)),2)) %>% 
  reactable(theme = reactableTheme(style = list(fontFamily = "Arial, sans-serif", fontSize = "1.0rem")))

```

We then convert these true/false positive/negative indicators to factors in order to map them.

```{r development predictions based on thresholds, fig.width=12, fig.height=10}
predsForMap <-         
  dat %>%
    mutate(probs = predict(Model6, dat, type="response") ,
           Threshold_5_Pct = as.factor(ifelse(probs >= 0.05 ,1,0)),
           Threshold_17_Pct =  as.factor(ifelse(probs >= 0.17 ,1,0)),
           Threshold_34_Pct =  as.factor(ifelse(probs >= 0.34 ,1,0))) %>%
    dplyr::select(development_change,Threshold_5_Pct,Threshold_17_Pct, Threshold_34_Pct) %>%
    gather(Variable,Value, -geometry) %>%
    st_cast("POLYGON")

ggplot() +
  geom_point(data=predsForMap, aes(x=xyC(predsForMap)[,1], y=xyC(predsForMap)[,2], colour=Value)) +
  facet_wrap(~Variable) +
  scale_colour_manual(values = c(gray, highlight), labels=c("No Change","New Development"),
                      name="") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title="Development Predictions - Low Threshold") + 
  mapTheme +
  theme(legend.position = "bottom")

```

These maps show that the 34% nicely captures the MSA's overall pattern of development while minimizing the tendency to over-predict development that the 17% and 5% thresholds share (the 5% threshold wildly so).

To better understand the trade-offs between accurately predicting false positives and false negatives, we can run our predictions through a confusion matrix to visualize the true positives ("TrueP") and true negatives ("TrueN") in the charts below, for each of the thresholds.

```{r development predictions based on confusion metrics, fig.width=10, fig.height=10}
dat <- dat %>% 
  mutate(probs = predict(Model6, dat, type="response"))
  
ConfusionMatrix.metrics <-
  dat %>%
    mutate(Threshold_5_Pct = as.factor(ifelse(probs >= 0.05 ,1,0)),
           Threshold_17_Pct =  as.factor(ifelse(probs >= 0.17 ,1,0)),
          Threshold_34_Pct =  as.factor(ifelse(probs >= 0.34 ,1,0))) %>%
    mutate(TrueP_05 = ifelse(development_change  == 1 & Threshold_5_Pct == 1, 1,0),
           TrueN_05 = ifelse(development_change  == 0 & Threshold_5_Pct == 0, 1,0),
           TrueP_17 = ifelse(development_change  == 1 & Threshold_17_Pct == 1, 1,0),
           TrueN_17 = ifelse(development_change  == 0 & Threshold_17_Pct == 0, 1,0),
           TrueP_34 = ifelse(development_change  == 1 & Threshold_34_Pct == 1, 1,0),
           TrueN_34 = ifelse(development_change  == 0 & Threshold_34_Pct == 0, 1,0)) %>%
    dplyr::select(., starts_with("True")) %>%
    gather(Variable, Value, -geometry) %>%
    st_cast("POLYGON") 

ggplot(data=ConfusionMatrix.metrics) +
  geom_point(aes(x=xyC(ConfusionMatrix.metrics)[,1], 
                 y=xyC(ConfusionMatrix.metrics)[,2], colour = as.factor(Value))) +
  facet_wrap(~Variable) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("Incorrect","Correct"),
                      name="") +
  geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .5) +
  geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .75) +
  labs(title="Development Predictions - Low Threshold") +
  mapTheme +
  theme(legend.position = "bottom")

```

Mapping the sensitivity (true positives) and specificity (true negatives) of these thresholds shows in further detail that the 34% threshold predicts noticeably true negatives than the other thresholds while predicting only slightly less true positives. Therefore, we select 34% as the probability threshold at which to classify land as likely to be developed in the remainder of this project.

# Planning Scenarios

Now that we have developed a predictive model for new development, we can use it to test the effects of potential future scenarios. We first use the model to predict how a demand-side change (projected population) might affect development in 2029. We then use the model to predict how a supply-side change (a new highway) might affect development

## Scenario 1: Demand-side Change Forecast

In this demand-side projection, we will use population projections for the Atlanta MSA counties and distribute the population among our fishnet cells. The goal is to visualize change and plan for growth scenarios based on how anticipated future population might drive development.

First, we replace the lag development 2008 column with the distance from development in 2019.

```{r lag dev 2019}
dat <-
  dat %>%
  mutate(lagDevelopment08 = nn_function(xyC(.), xyC(filter(., developed19 == 1)),2))
```

Next, we look at projected population by county in 2029 based on population estimates from the [Georgia Data Analytics Center](https://gdac.georgia.gov/census).

```{r projected pop 2029}
#County projections 2029
#Barrow 96977 -
#Bartow 119594 - 
#Carroll 131577 -
#Cherokee  301752-
#Clayton  321410-
#Cobb  830671-
#Coweta  172341 -
#Dawson    	32572-
#DeKalb  822154-
#Douglas  160635-
#Fayette  128627-
#Forsyth   316165-
#Fulton  1198334-
#Gwinnett   1044026-
#Hall   	232720-
#Henry   278739-
#Newton 130559-
#Paulding 208196-
#Rockdale 	95574-
#Spalding   70974-
#Walton 	109205-

countyPopulation_2029 <- 
  data.frame(
   NAME = Counties,
   county_projection_2029 = 
     c(96977,119594,131577,301752,321410,830671,172341,32572,822154,160635,128627,
       316165,1198334,1044026,232720,278739,130559,208196,95574,70974,109205)) %>%
   left_join(
     dat %>%
       st_set_geometry(NULL) %>%
       group_by(NAME) %>%
       summarize(county_population_2019 = round(sum(pop19))))

countyPopulation_2029 %>%
  gather(Variable,Value, -NAME) %>%
  ggplot(aes(reorder(NAME,-Value),Value)) +
  geom_bar(aes(fill=Variable), stat = "identity", position = "dodge") +
  scale_fill_manual(values = palette2,
                    labels=c("2019","2029"),
                    name="Population") +
  labs(title="Population Change by County: 2019 - 2029",
       x="County", y="Population") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  plotTheme
```

We see that Fulton, Gwinnett, DeKalb, and Cobb counties are projected to experience the largest population increases between 2019 and 2029.

#### Predicting Development Demand (based on projected population)

Now, we will take the county-level population projections and distribute them across the study area. We do this by weighting it proportionally according to a grid cell's 2019 population and then subtracting 2019 pop to get `pop_change`.

```{r predicted development demand 2029}
dat_infill <-
  dat %>%
  #calculate population change
    left_join(countyPopulation_2029) %>%
    mutate(proportion_of_county_pop = pop19 / county_population_2019,
           pop_2029.infill = proportion_of_county_pop * county_projection_2029,
           pop_Change = round(pop_2029.infill - pop19),2) %>%
    dplyr::select(-county_projection_2029, -county_population_2019, 
                  -proportion_of_county_pop, -pop_2029.infill) %>%
  #predict for 2029
    mutate(predict_2029.infill = predict(Model6,. , type="response"))

map_pred29 <- 
dat_infill %>%
  ggplot() +  
  geom_point(aes(x=xyC(dat_infill)[,1], y=xyC(dat_infill)[,2], colour = factor(ntile(predict_2029.infill,5)))) +
  scale_colour_manual(values = palette5,
                    labels=substr(quintileBreaks(dat_infill,"predict_2029.infill"),1,4),
                    name="% Likelihood\nof Development") +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  labs(title= "Development Demand in 2029: Predicted Probabilities",
    subtitle = "Based on population change projections")+
  mapTheme

map_pred29
```

Mapping predicted development based on projected population change for 2029 doesn't show any huge changes from where our model predicted development would be in 2019. New development is still predicted to most likely occur in the outer counties. Although, our model is predicting increased development in the northwest and southwest corners of the metro which it was not predicting before we added in the 2029 population projections.

<br>

## Scenario 2: Supply-side Change Forecast

To begin our supply-side projection, we imagine a new highway in Walton County, in the eastern part of the metro, where development between 2008 and 2019 was less concentrated. This was also a forested area, so the fact that much development had not happened there over the earlier period aroused our curiosity. This way, we can test the extent that a new road investment influences development in a desirable area of land cover. This new road was created as a line segment in ArcGIS and merged with the existing highways shapefile.

```{r import new roads files, results='hide'}
#NEW HIGHWAY SEGMENT ONLY
new_highway <- st_read("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/a5a5b0c9099b97f68c0d1aeadcdfe9dbd2960a9a/Data/geojson/new_highway.geojson")

new_highway <- new_highway %>% 
st_transform(st_crs(metro_counties)) %>%
  st_intersection(metro_counties) 

#new_highway <- st_write(new_highway,"C:/Users/Lindsey/Desktop/CPLN675_Land_Use_Modeling_Desktop/Final/LUEM_A5_Repo/Data/geojson/new_highway.geojson")

#ALL HIGHWAYS MERGED TOGETHER -- highways_all --  

highways_all <- st_read("https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/a5a5b0c9099b97f68c0d1aeadcdfe9dbd2960a9a/Data/geojson/highways_all.geojson") %>% 
st_transform(st_crs(metro_counties)) %>%
  st_intersection(metro_counties)
```

```{r mapping the new highway addition, fig.width=8}

ggplot() +
  geom_point(data=dev_change_fishnet, 
             aes(x=xyC(dev_change_fishnet)$x, y=xyC(dev_change_fishnet)$y, colour=development_change)) +
  scale_colour_manual(values = c(gray, "#ED7953FF"),
                      labels=c("No Change","New Development"),
                      name = "") +
  labs(title = "Land Cover Development Change",
       subtitle = "Atlanta metro area | Fishnet centroids with existing highways\nin dark blue and a new highway in cyan") +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  geom_sf(data=AtlantaHighways, color = highlight, linewidth = .5) +
  geom_sf(data=new_highway, colour="cyan", linewidth=1) +
  mapTheme

```

Now, with the new highways dataset, we can create a new distance-to-highway raster, and from there convert it to fishnet centroids which we merge with the fishnet. This helps us understand how far the fishnet cells are from the new highway.

```{r create highway distance raster with new highway segment }

#new_highway_raster <- 
 #  as(highways_all,'Spatial') %>%
  # rasterize(.,emptyRaster)

#new_highway_raster_distance <- distance(new_highway_raster)
#writeRaster(new_highway_raster_distance, filename=file.path(rast_dir, "new_highway_dist.tif"), format="GTiff", overwrite=TRUE)

new_highway_raster_distance <- raster(paste(dir_git, "187ac7cf231257fc52de0f7f2a1dc2014e4e6f2d/Data/raster/new_highway_dist.tif", sep='/'))

names(new_highway_raster_distance) <- "distance_highways"

new_highwayPoints <-
  rasterToPoints(new_highway_raster_distance) %>%
  as.data.frame() %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(AtlantaMSA_fishnet))

new_highwayPoints_fishnet <- 
  aggregate(new_highwayPoints, AtlantaMSA_fishnet, mean) %>%
  mutate(distance_highways = ifelse(is.na(distance_highways),0,distance_highways))

ggplot() +
  geom_sf(data=metro_counties) +
  geom_point(data=new_highwayPoints_fishnet, aes(x=xyC(new_highwayPoints_fishnet)[,1], 
                                             y=xyC(new_highwayPoints_fishnet)[,2], 
                 colour=factor(ntile(distance_highways,5))),size=1.5) +
  scale_colour_manual(values = palette5,
                      labels=substr(quintileBreaks(new_highwayPoints_fishnet,"distance_highways"),1,8),
                      name="Quintile\nBreaks") +
  geom_sf(data=highways_all, colour = "black") +
  geom_sf(data=new_highway, colour="cyan", linewidth=.75) +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  labs(title = "Distance to Highways",
       subtitle = "As fishnet centroids; Existing highways visualized in black; new highway in cyan") +
  mapTheme

```

We then use Model 6 to predict how adding a highway might affect development patterns in 2029, based on scenario 1. While this model does not show a true supply-side only effect, it is useful because it allows us to directly compare how adding in a supply-side factor affects the model's demand-side only predictions.

```{r predict with new highway, fig.width=8, fig.height=12}
dat_highway <-
  dat_infill %>%
  dplyr::select(-distance_highways) %>% 
  left_join(as.data.frame(new_highwayPoints_fishnet)) %>% 
  mutate(predict_2029.distance_highways = predict(Model6,. , type="response"))


Map_highway_pred <-
dat_highway %>%
  ggplot() +  
  geom_point(aes(x=xyC(dat_highway)[,1], y=xyC(dat_highway)[,2], colour = factor(ntile(predict_2029.distance_highways,5)))) +
  scale_colour_manual(values = palette5,
                    labels=substr(quintileBreaks(dat_highway,"predict_2029.distance_highways"),1,4),
                    name="% Likelihood\nof Development") +
  geom_sf(data=metro_counties, fill=NA, colour="white", size=.75) +
  geom_sf(data=highways_all, colour = "black") +
  geom_sf(data=new_highway, colour="cyan", linewidth=.75) +
  geom_sf(data = Atlanta_MSA, fill = "transparent",
          color = "black", linewidth = .75) +
  labs(title= "Development Supply in 2029: Predicted Probabilities",
       subtitle = "Based on projected development response to a new highway") +
  mapTheme

grid.arrange(ncol=1,
             map_pred29,
             Map_highway_pred
             )
```

While the change is slight, we see that adding in the new highway does slightly increase the predicted likelihood of development in the area around it.

# Allocation

Now that we better understand the effects of demand-side factors (like population growth and proximity to developed areas) and supply-side factors (like new infrastructure available land), we can analyze these factors by county. This allows us to better understand which counties in the Atlanta MSA are better suited to development than others. It also allows us to see which land use and development policies might be best suited to this region.

Let's first orient ourselves to how each kind of land cover was distributed in 2019.

```{r aggregate 2019 rasters, fig.width=10, fig.height=10}
dat2 <-
  aggregateRaster(theRasterList19, dat) %>%
  dplyr::select(developed19, forest19, farm19, wetlands19, otherUndeveloped19, water19) %>%
  st_set_geometry(NULL) %>%
  bind_cols(.,dat) %>%
  st_sf() %>%
  st_cast("POLYGON")

rasters19_map <- aggregatedRasters19 %>%
  gather(var,value,developed19:water19) %>%
  st_cast("POLYGON") %>%    #just to make sure no weird geometries slipped in
  mutate(X = xyC(.)$x,
         Y = xyC(.)$y) %>%
  ggplot() +
    geom_sf(data=metro_counties) +
    geom_point(aes(X,Y, colour=as.factor(value))) +
    facet_wrap(~var) +
    scale_colour_manual(values = c(gray, highlight),
                        labels=c("False","True"),
                        name = "") +
    geom_sf(data=metro_counties,
          color = "white",
          fill = "transparent",
          linewidth = .25) +
    geom_sf(data=Atlanta_MSA,
          color = "black",
          fill = "transparent",
          linewidth = .5) +
    labs(title = "Land Cover Types, 2019",
         subtitle = "As fishnet centroids") +
   mapTheme +
   theme()  +
   theme(legend.position = "none")

rasters19_map
```

We see that much of the metro area, including developed areas, is forested - meaning it has a large amount of tree cover. Farms and other undeveloped lands tend to be located more on the outer edges. Wetlands on the other hand are concentrated mostly in the metro region's southern counties, along with a number of small ponds and rivers. The largest water bodies, however, are located in the north of the metro.

Mapping these land cover types separately allows us to see that some counties may be better suited to development, while others have more sensitive land cover that should be protected.

However, this method does not allow us to pinpoint where these locations are with much specificity.

## Sensitive Land Cover Lost

To better understand where ecologically important lands are being lost for development, we create an indicator layer called `sensitive_lost19`. This new layer shows where forest or wetlands were lost to development between 2009 and 2019.

```{r sensitive landcover loss, fig.width=8}
dat2 <-
  dat2 %>%
   mutate(sensitive_lost19 = ifelse(forest08 == 1 & forest19 == 0 |
                                    wetlands08 == 1 & wetlands19 == 0,1,0)) %>% 
  st_transform("ESRI:102267")
                      
ggplot() +
  geom_point(data=dat2, aes(x=xyC(dat2)[,1], y=xyC(dat2)[,2], colour=as.factor(sensitive_lost19))) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("No Change","Sensitive Lost"),
                      name = "") +
  geom_sf(data = metro_counties, fill = "transparent", color = "white", linewidth = .5) +
  geom_sf(data = Atlanta_MSA, fill = "transparent", color = "black", linewidth = .75) +
  labs(title = "Sensitive lands lost: 2009 - 2019",
       subtitle = "As fishnet centroids") +
  mapTheme
```

Interestingly, we see that more sensitive areas were lost to development near the metro's center during this period.

## Landscape Fragmentation

To contextualize these sensitive locations, we group areas where the `wetlands19` and `forest11` rasters are contiguous to create a `sensitive_regions` layer containing all sensitive regions with an area greater than 1 acre.

```{r sensitive regions, fig.width=8}
sensitiveRegions <- 
  raster::clump(wetlands19 + forest19) %>% #units are meters
  rasterToPolygons() %>%
  st_as_sf() %>%
  group_by(clumps) %>% 
  summarize() %>%
    mutate(Acres = as.numeric(st_area(.) * 0.00024711)) %>% #There are 0.00024711 acres per square meter and 4046.86 square meters in an acre
    filter(Acres > 4046.86)  %>%
  dplyr::select() %>%
  raster::rasterize(.,emptyRaster) 

sensitiveRegions[sensitiveRegions > 0] <- 1  
names(sensitiveRegions) <- "sensitiveRegions"

dat2 <-
  aggregateRaster(c(sensitiveRegions), dat2) %>%
  dplyr::select(sensitiveRegions) %>%
  st_set_geometry(NULL) %>%
  bind_cols(.,dat2) %>%
  st_sf()

ggplot() +
  geom_point(data=dat2, aes(x=xyC(dat2)[,1], y=xyC(dat2)[,2], colour=as.factor(sensitiveRegions))) +
  scale_colour_manual(values = c(gray, highlight),
                      labels=c("Other","Sensitive Regions"),
                      name="") +
  geom_sf(data = metro_counties, fill = "transparent", color = "white", linewidth = .5) +
  geom_sf(data = Atlanta_MSA, fill = "transparent", color = "black", linewidth = .75) +
  labs(title = "Sensitive regions",
       subtitle = "Continous areas of either wetlands or forests\ngreater than 1 acre") +
  mapTheme
```

This gives us a much more clear understanding of which areas might be most sensitive to development than mapping each type of land cover separately did. We see that most of the counties along the outer border of the Atlanta metro region still have relatively low landscape fragmentation. This suggests that these counties should prioritize infill development over suburban sprawl.

## Summarize by County

To better understand how these characteristics compare across counties, we create a matrix of demand side and suitability factors.

This matrix allows us to better understand each county's suitability for development.

```{r county specific metrics}
county_specific_metrics <- 
  dat2 %>%
  #predict development demand from our model
  mutate(Development_Demand = predict(Model6, dat2, type="response")) %>%
  #get a count count of grid cells by county which we can use to calculate rates below
  left_join(st_set_geometry(dat, NULL) %>% group_by(NAME) %>% summarize(count = n())) %>%
  #calculate summary statistics by county
  group_by(NAME) %>%
  summarize(Total_Farmland = sum(farm19) / max(count),
            Total_Forest = sum(forest19) / max(count),
            Total_Wetlands = sum(wetlands19) / max(count),
            Total_Undeveloped = sum(otherUndeveloped19) / max(count),
            Sensitive_Land_Lost = sum(sensitive_lost19) / max(count),
            Sensitive_Regions = sum(sensitiveRegions) / max(count),
            Mean_Development_Demand = mean(Development_Demand)) %>%
  #get population data by county
  left_join(countyPopulation_2029 %>% 
            mutate(Population_Change = county_projection_2029 - county_population_2019,
                   Population_Change_Rate = Population_Change / county_projection_2029) %>%
            dplyr::select(NAME,Population_Change_Rate))
```

```{r plot county specific metrics, fig.width=12, fig.height=11}
county_specific_metrics %>%
  gather(Variable, Value, -NAME, -geometry) %>%
  mutate(Variable = factor(Variable, levels=c("Population_Change_Rate","Mean_Development_Demand",
                                              "Total_Farmland","Total_Undeveloped","Total_Forest",
                                              "Total_Wetlands","Sensitive_Land_Lost","Sensitive_Regions",
                                              ordered = TRUE))) %>%
  mutate(Planning_Designation = case_when(
    Variable == "Population_Change_Rate" | Variable == "Mean_Development_Demand" ~ "Demand-Side",
    Variable == "Total_Farmland" | Variable == "Total_Undeveloped"               ~ "Suitable",
    TRUE                                                                         ~ "Not Suitable")) %>%
  ggplot(aes(x=Variable, y=Value, fill=Planning_Designation)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black") +
    facet_wrap(~NAME, ncol=5) +
    coord_flip() +
    scale_y_continuous(breaks = seq(.25, 1, by = .25)) +
    geom_vline(xintercept = 2.5) + geom_vline(xintercept = 4.5) +
    scale_fill_manual(values=c("#0D0887FF","#BD3786FF","#FDC926FF")) +
    labs(title= "County Specific Allocation Metrics", subtitle= "As rates", x="Indicator", y="Rate") +
    plotTheme + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="bottom")
```

This chart shows three types of metrics:  
-   Demand-side factors  
-   Factors suitable for development, and  
-   Factors not suitable for development  

The demand side factors are `Mean_Development_Demand` and `Population_Change_Rate`. `Mean_Development_Demand` is the mean value of predicted development demand for a given county in 2029. `Population_Change_Rate` is the share of a county's population that is projected to move-in between 2019 and 2029.

Factors suitable for development are area of `Total_Farmland` and area of `Total_Undeveloped` land in a given county.

Factors not suitable for development are area of `Sensitive_Regions`, `Sensitive_Land_Lost`, and `Total_Wetlands` in a county.

In determining our allocation criteria, we determined any area with a wetland was not suitable for development. Attending to the externalities of urban sprawl, we chose to preserve these vital ecosystems. When it comes to forests, however, given that the Atlanta MSA has such extensive tree cover, we determined that forested areas are suitable for development.

Looking at the development suitability indicators across the 21 counties of Georgia's MSA, we see that all counties have a large percentage of forested land and therefore of sensitive regions. Looking at the rest of the indicators together however, we can see that some counties are better suited for development than others.

For example, we see that Hall County has both a high rate of development demand and of population change. It also has a relatively large amount of farmland and a relatively low amount of total wetlands and sensitive land lost. Together, these factors suggest that Hall County has ample space for development that will not impact sensitive areas (other than forest).

The satellite image below of development in Hall County shows why there is a large amount of forested land cover across the Atlanta metro region. There seems to be a regional preference for dispersed development in forested land.

![](https://raw.githubusercontent.com/c-townsley/LUEM_A5_Repo/ffa083e7e1a63b11e033f386ec7fe00b2d3414d3/Data/Images/Satellite_HallCounty.png)



On the other hand, Clayton County appears to be poorly suited for development because it has relatively high development demand, is experiencing population growth, has a low amount of farmland and undeveloped land, and has a relatively high amount of wetlands.

Let's look more closely at Hall County (in the North) and Clayton County (to the South) to see what these traits look like spatially.

```{r Hall and Clayton Counties}
ggplot() +
  geom_sf(data = metro_counties, fill = gray, color = "white", linewidth = .5) +
  geom_sf(data = filter(metro_counties, NAME=="Hall" | NAME=="Clayton"), fill = "#FDC926FF", color = "white", linewidth = .5) +
  geom_sf(data = Atlanta_MSA, fill = "transparent", color = "black", linewidth = .75) +
  labs(title = "Atlanta MSA",
       subtitle = "Hall and Clayton Counties Highlighted") +
  mapTheme
```

## Hall County

First, we map supply and demand factors for Hall County, to examine an example of a county well suited to further development.

```{r hall county, fig.width=12, fig.height=6}
HallCounty <-
  dat2 %>%
    mutate(Development_Demand = predict(Model6, dat2, type="response")) %>%
    filter(NAME == "Hall") %>%
    rename("developed19" = developed19...2)

HallCounty_landUse <- rbind(
  filter(HallCounty, wetlands19 == 1 ) %>%
  dplyr::select() %>% mutate(Land_Use = "Not Suitable"),
  filter(HallCounty, developed19 == 1) %>%
  dplyr::select() %>% mutate(Land_Use = "Developed"))

Map_Hall1 <-
ggplot() +
  geom_sf(data=HallCounty, aes(fill=factor(ntile(Development_Demand,5))), colour=NA) +
  geom_point(data=HallCounty_landUse, aes(x=xyC(HallCounty_landUse)[,1], 
                                        y=xyC(HallCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 3) +
  geom_sf(data=st_intersection(AtlantaHighways,filter(metro_counties, NAME=="Hall")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Development\nDemand",
                    labels=substr(quintileBreaks(HallCounty,"Development_Demand"),1,5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Hall"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Development Potential, 2029: Hall County",
       subtitle = "With Developed and Unsuitable Areas from 2019") +
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2))

Map_Hall2 <-
ggplot() +
  geom_sf(data=HallCounty, aes(fill=factor(ntile(pop_Change,5))), colour=NA) +
  geom_point(data=HallCounty_landUse, aes(x=xyC(HallCounty_landUse)[,1], 
                                        y=xyC(HallCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 3) +
  geom_sf(data=st_intersection(AtlantaHighways, filter(metro_counties, NAME=="Hall")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Population\nChange",
                    labels=substr(quintileBreaks(HallCounty,"pop_Change"),1,5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Hall"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Projected Population, 2029: Hall County",
       subtitle = "With Developed and Unsuitable Areas from 2019") +
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2))

grid.arrange(
  Map_Hall1,
  Map_Hall2,
  ncol=2)
```

Mapping Hall County's development potential and projected population with developed land and wetlands (the "not suitable" land) shows us that the County still has a significant amount of area that is suitable for development. In particular, this county would do well to prioritize infill development between highways in its northwest and southern portions where there is predicted to be both high development demand and significant population growth. This would also help to reduce forest fragmentation and pressure on wetlands in the county's east and northeast.

## Clayton County

Next, we map the same supply and demand factors for Clayton County to examine an example of a county not well suited to further development.

```{r Clayton County, fig.width=12, fig.height=10}
ClaytonCounty <-
  dat2 %>%
    mutate(Development_Demand = predict(Model6, dat2, type="response")) %>%
    filter(NAME == "Clayton") %>%
    rename("developed19" = developed19...2)

ClaytonCounty_landUse <- rbind(
  filter(ClaytonCounty, wetlands19 == 1 ) %>%
  dplyr::select() %>% mutate(Land_Use = "Not Suitable"),
  filter(ClaytonCounty, developed19 == 1) %>%
  dplyr::select() %>% mutate(Land_Use = "Developed"))

grid.arrange(
ggplot() +
  geom_sf(data=ClaytonCounty, aes(fill=factor(ntile(Development_Demand,5))), colour=NA) +
  geom_point(data=ClaytonCounty_landUse, aes(x=xyC(ClaytonCounty_landUse)[,1], 
                                        y=xyC(ClaytonCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 6) +
  geom_sf(data=st_intersection(AtlantaHighways,filter(metro_counties, NAME=="Clayton")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Development\nDemand",
                    labels=substr(quintileBreaks(ClaytonCounty,"Development_Demand"),1,5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Clayton"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Development Potential, 2029: Clayton County",
       subtitle = "With Developed and Unsuitable Areas from 2019") + 
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2)),

ggplot() +
  geom_sf(data=ClaytonCounty, aes(fill=factor(ntile(pop_Change,5))), colour=NA) +
  geom_point(data=ClaytonCounty_landUse, aes(x=xyC(ClaytonCounty_landUse)[,1], 
                                        y=xyC(ClaytonCounty_landUse)[,2], colour=Land_Use),
                                        shape = 15, size = 6) +
  geom_sf(data=st_intersection(AtlantaHighways, filter(metro_counties, NAME=="Clayton")), linewidth=1, color = darkGray) +
  scale_fill_manual(values = palette5, name="Population\nChange",
                    labels=substr(quintileBreaks(ClaytonCounty, "pop_Change"), 1, 5)) +
  scale_colour_manual(values = c(gray,"#F0F921FF")) + 
  geom_sf(data = filter(metro_counties, NAME=="Clayton"), fill = "transparent", color = "black", linewidth = 1) +
  labs(title = "Projected Population, 2029: Clayton County",
      subtitle = "With Developed and Unsuitable Areas from 2019") + 
  mapTheme +
  guides(fill = guide_legend(order = 1), colour = guide_legend(order = 2)), ncol=2)
```

In contrast to Hall County's relatively large amount of land that is suitable for development, we see that Clayton County has almost none. The county is almost entirely developed and the two areas in the southern portion of the county that are not developed are dominated by wetland areas that are not suitable for development.

Furthermore, the map shows that the remaining cells are all near wetland areas or not expected to see population growth. Therefore, Clayton County is an ideal location for conservation rather than development in the Atlanta metro area.

# Conclusion

Thank you for walking through our model. We conclude by summarizing the strengths, weaknesses, and planning implications of this approach.

**Strengths of approach**  
While not perfect, the strengths of this approach are that it allows us to estimate future urban sprawl based on land cover and population characteristics of a given area. Being able to visually analyze development potential through maps and charts has great utility for communicating the complex nature of urban sprawl to everyone from policymakers to community members with non-technical backgrounds.

**Weaknesses**  
While an incredibly useful tool, because this approach only takes inputs from two study years, it has the potential to predict future development based on a small historical sample and anomalous trends. Furthermore, while the maps and charts are helpful for communication, they are estimations at best, and only one piece of the land allocation toolkit. The convincing nature of these graphics risks communicating these estimates as fact. Finally, the model is only as good as the variables within it. In future studies, we could improve our model by testing additional predictor variables such as school districts, demographic information, and real estate values.

**Policy implications**  
Based on the results of our analysis, the 21 county Atlanta metro region would benefit from conservation easements to prevent further landscape fragmentation. This would protect the vitality of forest ecosystems over the long-term. To calibrate against further landscape fragmentation, this analysis suggests that policymakers could implement an urban growth boundary in the fringe counties to protect the metro's remaining sensitive regions before they are fully developed. This would have the added benefit of encouraging infill development in the MSA.

```{r export charts, include = FALSE, eval = FALSE}
dir_ct_graphics <- "C:/Users/ctown/OneDrive - PennO365/Classes/Classes_Sem4_2023Spring/CPLN 675_Land Modeling/Assignments/A5_SprawlForecasting/Graphics"

ggsave(paste(dir_ct_graphics, "HallCounty1.pdf", sep='/'), plot = Map_Hall1, width = 11)
ggsave(paste(dir_ct_graphics, "HallCounty2.pdf", sep='/'), plot = Map_Hall2, width = 11)

ggsave(paste(dir_ct_graphics, "HighwayPred.pdf", sep='/'), plot = Map_highway_pred, width = 11)
ggsave(paste(dir_ct_graphics, "PopPredMap.pdf", sep='/'), plot = map_pred29, width = 11)

ggsave(paste(dir_ct_graphics, "Chart_TestProbs.pdf", sep='/'), plot = chart_testProbs, width = 11)

ggsave(paste(dir_ct_graphics, "Landcover08.pdf", sep='/'), plot = rasters08_map, width = 11)
```
